ZSI_Reconnect_China/WOS/wos_univ_normalizer.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"# Importing libraries and module and some setting for notebook\n",
"\n",
"import pandas as pd\n",
"import re\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"from scipy.sparse import csr_matrix\n",
"import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n",
"import time\n",
"from tqdm import tqdm"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"def wikinorm(univ_string):\n",
" from googlesearch import search\n",
" from nltk.metrics import edit_distance\n",
" from operator import itemgetter\n",
" from numpy.random import default_rng\n",
" rng = default_rng()\n",
" results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n",
" univ_name = univ_string.split(\",\")[0]\n",
" u_results = [i.title for i in results if \"Category:\" not in i.title]\n",
" return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n"
],
"metadata": {
"collapsed": false
}
},
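{
"cell_type": "markdown",
"source": [
"Illustrative sketch (not part of the original run): `wikinorm` ranks Google result titles by edit distance to the institution name, i.e. the part of the affiliation string before the first comma. The cell below reproduces only that ranking step on hypothetical candidate titles, so it runs without issuing live search queries.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from nltk.metrics import edit_distance\n",
"from operator import itemgetter\n",
"\n",
"# Hypothetical candidate page titles, standing in for live Google results\n",
"candidates = [\"University of Copenhagen - Wikipedia\", \"Copenhagen Business School - Wikipedia\"]\n",
"univ_name = \"Univ Copenhagen, Denmark, wikipedia\".split(\",\")[0]\n",
"# Keep the title with the smallest edit distance to the institution name\n",
"sorted([(title, edit_distance(univ_name, title)) for title in candidates], key=itemgetter(1))[0][0]"
],
"metadata": {
"collapsed": false
}
},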
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"def replace_uppercase_words(text):\n",
" words = text.split()\n",
" all_uppercase = all(word.isupper() for word in words)\n",
" all_lowercase = all(word.islower() for word in words)\n",
" if all_uppercase or all_lowercase:\n",
" return text\n",
" else:\n",
" result = []\n",
" for word in words:\n",
" w = word.strip()\n",
" if not w.isupper() and not w.islower():\n",
" result.append(w)\n",
" return \" \".join(result).strip()"
],
"metadata": {
"collapsed": false
}
},
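{
"cell_type": "markdown",
"source": [
"Quick sanity check (illustrative inputs, not drawn from the WOS data): `replace_uppercase_words` keeps mixed-case tokens, drops all-caps and all-lowercase tokens, and returns uniformly cased strings unchanged.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Hypothetical affiliation fragments used only to illustrate the behaviour\n",
"print(replace_uppercase_words(\"UNIV Tsinghua Univ BEIJING\"))  # mixed-case tokens kept -> 'Tsinghua Univ'\n",
"print(replace_uppercase_words(\"TSINGHUA UNIV\"))  # uniformly upper-case -> returned unchanged"
],
"metadata": {
"collapsed": false
}
},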
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: Pandarallel will run on 4 workers.\n",
"INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n",
"\n",
"WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
"https://nalepae.github.io/pandarallel/troubleshooting/\n"
]
},
{
"data": {
"text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "5f8bead5565146a5843c01b81b77cf9f"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"178638\n"
]
}
],
"source": [
"outdir=\"wos_processed_data\"\n",
"univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
"\n",
"from pandarallel import pandarallel\n",
"pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
"\n",
"univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
"print(len(univ))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution \n149037 WOS:000764953300001 Univ Elect Sci & Technol China \\\n86834 WOS:000519526500027 Radboud Univ Nijmegen \n143915 WOS:000739917304088 Swiss Fed Inst Technol \n135117 WOS:000707680800001 North China Elect Power Univ Beijing \n110390 WOS:000605608700001 Imperial Coll London \n... ... ... \n21250 WOS:000358912300001 Jilin Univ \n23018 WOS:000364230600002 Tampere Univ Technol \n126847 WOS:000675855300001 Univ Copenhagen \n15313 WOS:000343701400001 Univ Siena \n77834 WOS:000490147400012 Tsinghua Univ \n\n Country Institution_harm \n149037 China Univ Elect Sci & Technol China \n86834 Netherlands Radboud Univ Nijmegen \n143915 Switzerland Swiss Fed Inst Technol \n135117 China North China Elect Power Univ Beijing \n110390 United Kingdom Imperial Coll London \n... ... ... \n21250 China Jilin Univ \n23018 Finland Tampere Univ Technol \n126847 Denmark Univ Copenhagen \n15313 Italy Univ Siena \n77834 China Tsinghua Univ \n\n[100 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>149037</th>\n <td>WOS:000764953300001</td>\n <td>Univ Elect Sci &amp; Technol China</td>\n <td>China</td>\n <td>Univ Elect Sci &amp; Technol China</td>\n </tr>\n <tr>\n <th>86834</th>\n <td>WOS:000519526500027</td>\n <td>Radboud Univ Nijmegen</td>\n <td>Netherlands</td>\n <td>Radboud Univ Nijmegen</td>\n </tr>\n <tr>\n <th>143915</th>\n <td>WOS:000739917304088</td>\n <td>Swiss Fed Inst Technol</td>\n <td>Switzerland</td>\n <td>Swiss Fed Inst Technol</td>\n </tr>\n <tr>\n <th>135117</th>\n <td>WOS:000707680800001</td>\n <td>North China Elect Power Univ Beijing</td>\n <td>China</td>\n <td>North China Elect Power Univ Beijing</td>\n </tr>\n <tr>\n <th>110390</th>\n <td>WOS:000605608700001</td>\n <td>Imperial Coll London</td>\n <td>United Kingdom</td>\n <td>Imperial Coll London</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>21250</th>\n <td>WOS:000358912300001</td>\n <td>Jilin Univ</td>\n <td>China</td>\n <td>Jilin Univ</td>\n </tr>\n <tr>\n <th>23018</th>\n <td>WOS:000364230600002</td>\n <td>Tampere Univ Technol</td>\n <td>Finland</td>\n <td>Tampere Univ Technol</td>\n </tr>\n <tr>\n <th>126847</th>\n <td>WOS:000675855300001</td>\n <td>Univ Copenhagen</td>\n <td>Denmark</td>\n <td>Univ Copenhagen</td>\n </tr>\n <tr>\n <th>15313</th>\n <td>WOS:000343701400001</td>\n <td>Univ Siena</td>\n <td>Italy</td>\n <td>Univ Siena</td>\n </tr>\n <tr>\n <th>77834</th>\n <td>WOS:000490147400012</td>\n <td>Tsinghua Univ</td>\n <td>China</td>\n <td>Tsinghua Univ</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": " Country Institution_harm \n496 Belgium Haute Ecole Louvain Hainaut \\\n10566 Germany IQM \n6670 China Tiantan Hosp \n16974 Switzerland Species Survival Commiss Mushroom Bracket & \n9200 France Hop Hotel Dieu Paris \n... ... ... \n11326 Germany Int Max Planck Res Sch Earth Syst Modeling \n2874 China China Natl Nucl Corp 416 Hosp \n17967 United Kingdom AccelerComm Ltd \n14041 Netherlands Vankeulen Advies \n4148 China First Peoples Hosp Jingmen Affiliated Hubei Minzu \n\n count \n496 1 \n10566 4 \n6670 1 \n16974 1 \n9200 2 \n... ... \n11326 1 \n2874 2 \n17967 1 \n14041 1 \n4148 1 \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>496</th>\n <td>Belgium</td>\n <td>Haute Ecole Louvain Hainaut</td>\n <td>1</td>\n </tr>\n <tr>\n <th>10566</th>\n <td>Germany</td>\n <td>IQM</td>\n <td>4</td>\n </tr>\n <tr>\n <th>6670</th>\n <td>China</td>\n <td>Tiantan Hosp</td>\n <td>1</td>\n </tr>\n <tr>\n <th>16974</th>\n <td>Switzerland</td>\n <td>Species Survival Commiss Mushroom Bracket &amp;</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9200</th>\n <td>France</td>\n <td>Hop Hotel Dieu Paris</td>\n <td>2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>11326</th>\n <td>Germany</td>\n <td>Int Max Planck Res Sch Earth Syst Modeling</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2874</th>\n <td>China</td>\n <td>China Natl Nucl Corp 416 Hosp</td>\n <td>2</td>\n </tr>\n <tr>\n <th>17967</th>\n <td>United Kingdom</td>\n <td>AccelerComm Ltd</td>\n <td>1</td>\n </tr>\n <tr>\n <th>14041</th>\n <td>Netherlands</td>\n <td>Vankeulen Advies</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4148</th>\n <td>China</td>\n <td>First Peoples Hosp Jingmen Affiliated Hubei Minzu</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n",
"# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n",
"univ_norm.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# from pandarallel import pandarallel\n",
"# pandarallel.initialize(progress_bar=True, nb_workers=2)\n",
"#\n",
"# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"def ngrams(string, n=3):\n",
"\n",
" string = re.sub(r'[,-./]|\\sBD',r'', string)\n",
" ngrams = zip(*[string[i:] for i in range(n)])\n",
" return [''.join(ngram) for ngram in ngrams]\n",
"\n",
"# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.\n",
"# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.\n",
"\n",
"def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
" # force A and B as a CSR matrix.\n",
" # If they have already been CSR, there is no overhead\n",
" A = A.tocsr()\n",
" B = B.tocsr()\n",
" M, _ = A.shape\n",
" _, N = B.shape\n",
"\n",
" idx_dtype = np.int32\n",
"\n",
" nnz_max = M*ntop\n",
"\n",
" indptr = np.zeros(M+1, dtype=idx_dtype)\n",
" indices = np.zeros(nnz_max, dtype=idx_dtype)\n",
" data = np.zeros(nnz_max, dtype=A.dtype)\n",
"\n",
" ct.sparse_dot_topn(\n",
" M, N, np.asarray(A.indptr, dtype=idx_dtype),\n",
" np.asarray(A.indices, dtype=idx_dtype),\n",
" A.data,\n",
" np.asarray(B.indptr, dtype=idx_dtype),\n",
" np.asarray(B.indices, dtype=idx_dtype),\n",
" B.data,\n",
" ntop,\n",
" lower_bound,\n",
" indptr, indices, data)\n",
"\n",
" return csr_matrix((data,indices,indptr),shape=(M,N))\n",
"\n",
"# unpacks the resulting sparse matrix\n",
"\n",
"def get_matches_df(sparse_matrix, name_vector, top=None):\n",
" non_zeros = sparse_matrix.nonzero()\n",
"\n",
" sparserows = non_zeros[0]\n",
" sparsecols = non_zeros[1]\n",
"\n",
" if top:\n",
" nr_matches = top\n",
" else:\n",
" nr_matches = sparsecols.size\n",
"\n",
" left_side = np.empty([nr_matches], dtype=object)\n",
" right_side = np.empty([nr_matches], dtype=object)\n",
" similarity = np.zeros(nr_matches)\n",
"\n",
" for index in range(0, nr_matches):\n",
" left_side[index] = name_vector[sparserows[index]]\n",
" right_side[index] = name_vector[sparsecols[index]]\n",
" similarity[index] = sparse_matrix.data[index]\n",
"\n",
" return pd.DataFrame({'left_side': left_side,\n",
" 'right_side': right_side,\n",
" 'similarity': similarity})\n",
"\n",
"\n",
"def discrepancy_filter(df):\n",
" f_df = df.copy()\n",
" tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n",
" for token in tokenlist:\n",
" f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n",
" (~f_df[\"left_side\"].str.contains(token)))\n",
" |\n",
" ((f_df[\"left_side\"].str.contains(token))&\n",
" (~f_df[\"right_side\"].str.contains(token))))].copy()\n",
" return f_df\n",
"\n",
"\n",
"# Define a function to get the high and low counts for each row\n",
"def get_high_low_counts(row):\n",
" if row['left_count'] > row['right_count']:\n",
" high_count = row['left_count']\n",
" low_count = row['right_count']\n",
" else: #row['left_count'] < row['right_count']:\n",
" high_count = row['right_count']\n",
" low_count = row['left_count']\n",
" # else:\n",
" # if len(row['left_side']) > len(row['right_side']):\n",
" # high_count = len(row['left_side'])\n",
" # low_count = len(row['right_side'])\n",
" # else:\n",
" # high_count = len(row['right_side'])\n",
" # low_count = len(row['left_side'])\n",
" return pd.Series([high_count, low_count])"
],
"metadata": {
"collapsed": false
}
},
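{
"cell_type": "markdown",
"source": [
"Illustrative sketch on a toy name list (hypothetical values, not part of the WOS data): build the character n-gram TF-IDF matrix, compute the top cosine-similarity pairs with `awesome_cossim_top`, and unpack them with `get_matches_df`.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Toy institution names (hypothetical, for demonstration only)\n",
"toy_names = pd.Series([\"Univ Copenhagen\", \"Univ of Copenhagen\", \"Copenhagen Business Sch\"])\n",
"toy_tfidf = TfidfVectorizer(min_df=1, analyzer=ngrams).fit_transform(toy_names)\n",
"# Keep up to 3 matches per name with cosine similarity above 0.5\n",
"toy_matches = awesome_cossim_top(toy_tfidf, toy_tfidf.transpose(), 3, 0.5)\n",
"get_matches_df(toy_matches, toy_names)"
],
"metadata": {
"collapsed": false
}
},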
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 31/31 [00:01<00:00, 27.86it/s]\n"
]
}
],
"source": [
"merger = pd.DataFrame()\n",
"\n",
"# for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n",
"for i in tqdm(list(univ_norm[\"Country\"].unique())):\n",
" sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n",
" types = sub_inst['Institution_harm']\n",
" vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
" tf_idf_matrix = vectorizer.fit_transform(types)\n",
" t1 = time.time()\n",
" matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8 if i!=\"China\" else 0.9)\n",
" t = time.time()-t1\n",
"\n",
" # store the matches into new dataframe called matched_df and printing 10 samples\n",
" matches_df = get_matches_df(matches, types)\n",
" matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n",
" matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n",
" matches_df[\"Country\"] = i\n",
" # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n",
" # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n",
" merger = pd.concat([merger,matches_df], ignore_index=True)\n",
"\n",
"for s in [\"left\",\"right\"]:\n",
" merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n",
"\n",
"# Apply the function to create a new column\n",
"merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n",
"\n",
"# Use apply again to create the high_side and low_side columns\n",
"merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n",
"merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], axis=1)\n",
"\n",
"# Drop the high_count and low_count columns if they are not needed\n",
"# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)"
],
"metadata": {
"collapsed": false
}
},
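{
"cell_type": "markdown",
"source": [
"Illustrative inspection (not in the original run): the strongest remaining fuzzy matches together with their frequency counts, before the low-frequency variants are collapsed into the high-frequency ones.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"merger.sort_values(\"similarity\", ascending=False)[[\"Country\", \"low_side\", \"high_side\", \"low_count\", \"high_count\", \"similarity\"]].head(10)"
],
"metadata": {
"collapsed": false
}
},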
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1916\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1916it [01:14, 25.70it/s]\n"
]
}
],
"source": [
"fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n",
"fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n",
"fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n",
"print(len(fuzzymerger))\n",
"univ_harm = univ.copy()\n",
"univ_harm[\"merge_iter\"] = 0\n",
"for i,row in tqdm(fuzzymerger.iterrows()):\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\")&\n",
" (univ_harm[\"Institution\"].str.lower().str.contains(\"sapien\"))&\n",
" (univ_harm[\"Institution\"].str.lower().str.contains(\"univ\"))), \"Institution_harm\"] = \"Sapienza Univ Rome\""
],
"metadata": {
"collapsed": false
}
},
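{
"cell_type": "markdown",
"source": [
"Illustrative check (not in the original run): list the raw affiliation strings that were collapsed into the manually harmonized `Sapienza Univ Rome` label.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"univ_harm.loc[univ_harm[\"Institution_harm\"] == \"Sapienza Univ Rome\", \"Institution\"].unique()"
],
"metadata": {
"collapsed": false
}
},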
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "Institution 19821\nInstitution_harm 16646\ndtype: int64"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm[[\"Institution\",\"Institution_harm\"]].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution Country \n124019 WOS:000663304500011 Excellium Consulting United Kingdom \\\n126511 WOS:000674472400006 Tongji Univ China \n33359 WOS:000391252900006 Beihang Univ China \n153729 WOS:000787596500003 Fudan Univ China \n69317 WOS:000467564700105 Guangdong Univ Technol China \n... ... ... ... \n160384 WOS:000812531900012 Zhengzhou Univ China \n133501 WOS:000702637000007 Univ Luxembourg Luxembourg \n140349 WOS:000728149000027 Regina Montis Regalis Hosp Italy \n160632 WOS:000813959600003 Liyang Peoples Hosp China \n174735 WOS:000888555700002 Swiss Fed Inst Technol Switzerland \n\n Institution_harm merge_iter \n124019 Excellium Consulting 0 \n126511 Tongji Univ 0 \n33359 Beihang Univ 0 \n153729 Fudan Univ 0 \n69317 Guangdong Univ Technol 0 \n... ... ... \n160384 Zhengzhou Univ 0 \n133501 Univ Luxembourg 0 \n140349 Regina Montis Regalis Hosp 0 \n160632 Liyang Peoples Hosp 0 \n174735 Swiss Fed Inst Technol 0 \n\n[500 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>merge_iter</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>124019</th>\n <td>WOS:000663304500011</td>\n <td>Excellium Consulting</td>\n <td>United Kingdom</td>\n <td>Excellium Consulting</td>\n <td>0</td>\n </tr>\n <tr>\n <th>126511</th>\n <td>WOS:000674472400006</td>\n <td>Tongji Univ</td>\n <td>China</td>\n <td>Tongji Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>33359</th>\n <td>WOS:000391252900006</td>\n <td>Beihang Univ</td>\n <td>China</td>\n <td>Beihang Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>153729</th>\n <td>WOS:000787596500003</td>\n <td>Fudan Univ</td>\n <td>China</td>\n <td>Fudan Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>69317</th>\n <td>WOS:000467564700105</td>\n <td>Guangdong Univ Technol</td>\n <td>China</td>\n <td>Guangdong Univ Technol</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>160384</th>\n <td>WOS:000812531900012</td>\n <td>Zhengzhou Univ</td>\n <td>China</td>\n <td>Zhengzhou Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>133501</th>\n <td>WOS:000702637000007</td>\n <td>Univ Luxembourg</td>\n <td>Luxembourg</td>\n <td>Univ Luxembourg</td>\n <td>0</td>\n </tr>\n <tr>\n <th>140349</th>\n <td>WOS:000728149000027</td>\n <td>Regina Montis Regalis Hosp</td>\n <td>Italy</td>\n <td>Regina Montis Regalis Hosp</td>\n <td>0</td>\n </tr>\n <tr>\n <th>160632</th>\n <td>WOS:000813959600003</td>\n <td>Liyang Peoples Hosp</td>\n <td>China</td>\n <td>Liyang Peoples Hosp</td>\n <td>0</td>\n </tr>\n <tr>\n <th>174735</th>\n <td>WOS:000888555700002</td>\n <td>Swiss Fed Inst Technol</td>\n <td>Switzerland</td>\n <td>Swiss Fed Inst Technol</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>500 rows × 5 columns</p>\n</div>"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm.sample(500)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"data": {
"text/plain": "merge_iter\n0 174128\n1 3966\n2 521\n3 22\n4 1\nName: count, dtype: int64"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm[\"merge_iter\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [
"univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}