ZSI_Reconnect_China/WOS/wos_univ_normalizer.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"# Importing libraries and module and some setting for notebook\n",
"\n",
"import pandas as pd\n",
"import re\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"from scipy.sparse import csr_matrix\n",
"import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n",
"import time\n",
"from tqdm import tqdm"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"def wikinorm(univ_string):\n",
" from googlesearch import search\n",
" from nltk.metrics import edit_distance\n",
" from operator import itemgetter\n",
" from numpy.random import default_rng\n",
" rng = default_rng()\n",
" results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n",
" univ_name = univ_string.split(\",\")[0]\n",
" u_results = [i.title for i in results if \"Category:\" not in i.title]\n",
" return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n"
],
"metadata": {
"collapsed": false
}
},
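{
"cell_type": "markdown",
"source": [
"A minimal usage sketch for `wikinorm`, assuming the unofficial `googlesearch` package and live network access; the example query string is illustrative and follows the commented-out `search_for` pattern used further below."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Illustrative only: wikinorm issues live Google queries via the unofficial googlesearch\n",
"# package, so the call below is left commented out. The query format mirrors the\n",
"# commented-out search_for column built further below (Institution + \", \" + Country + \", wikipedia\").\n",
"# wikinorm(\"Univ Pisa, Italy, wikipedia\")"
],
"metadata": {
"collapsed": false
}
},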
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"def replace_uppercase_words(text):\n",
" words = text.split()\n",
" all_uppercase = all(word.isupper() for word in words)\n",
" all_lowercase = all(word.islower() for word in words)\n",
" if all_uppercase or all_lowercase:\n",
" return text\n",
" else:\n",
" result = []\n",
" for word in words:\n",
" w = word.strip()\n",
" if not w.isupper() and not w.islower():\n",
" result.append(w)\n",
" return \" \".join(result).strip()"
],
"metadata": {
"collapsed": false
}
},
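{
"cell_type": "markdown",
"source": [
"A quick sanity check of `replace_uppercase_words` on strings visible in the sample further below: uniformly cased strings pass through unchanged, while acronyms inside mixed-case names are dropped."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# All-uppercase strings are returned as-is; acronyms inside mixed-case names are dropped.\n",
"print(replace_uppercase_words(\"CNR\"))                    # -> \"CNR\"\n",
"print(replace_uppercase_words(\"KTH Royal Inst Technol\")) # -> \"Royal Inst Technol\""
],
"metadata": {
"collapsed": false
}
},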
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: Pandarallel will run on 4 workers.\n",
"INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n",
"\n",
"WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
"https://nalepae.github.io/pandarallel/troubleshooting/\n"
]
},
{
"data": {
"text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "92c1cd6c14644ffeb042b38f5d5d98c5"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"178638\n"
]
}
],
"source": [
"outdir=\"wos_processed_data\"\n",
"univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
"\n",
"from pandarallel import pandarallel\n",
"pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
"\n",
"univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
"print(len(univ))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution \n153271 WOS:000784587900008 Univ Pisa \\\n159800 WOS:000810042500002 China Japan Friendship Hosp \n130931 WOS:000691922800007 Karl Franzens Univ Graz \n1500 WOS:000292944600012 CNR \n113964 WOS:000618210000032 Karolinska Univ Hosp \n... ... ... \n160284 WOS:000812227000009 Univ Appl Sci Upper Austria \n29314 WOS:000381396400013 Univ Southampton \n17045 WOS:000347046200017 Charles Univ Prague \n164118 WOS:000832954200001 Nanjing Univ Aeronaut & Astronaut \n109992 WOS:000604257500070 KTH Royal Inst Technol \n\n Country Institution_harm \n153271 Italy Univ Pisa \n159800 China China Japan Friendship Hosp \n130931 Austria Karl Franzens Univ Graz \n1500 Italy CNR \n113964 Sweden Karolinska Univ Hosp \n... ... ... \n160284 Austria Univ Appl Sci Upper Austria \n29314 United Kingdom Univ Southampton \n17045 Czech Republic Charles Univ Prague \n164118 China Nanjing Univ Aeronaut & Astronaut \n109992 Sweden Royal Inst Technol \n\n[100 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>153271</th>\n <td>WOS:000784587900008</td>\n <td>Univ Pisa</td>\n <td>Italy</td>\n <td>Univ Pisa</td>\n </tr>\n <tr>\n <th>159800</th>\n <td>WOS:000810042500002</td>\n <td>China Japan Friendship Hosp</td>\n <td>China</td>\n <td>China Japan Friendship Hosp</td>\n </tr>\n <tr>\n <th>130931</th>\n <td>WOS:000691922800007</td>\n <td>Karl Franzens Univ Graz</td>\n <td>Austria</td>\n <td>Karl Franzens Univ Graz</td>\n </tr>\n <tr>\n <th>1500</th>\n <td>WOS:000292944600012</td>\n <td>CNR</td>\n <td>Italy</td>\n <td>CNR</td>\n </tr>\n <tr>\n <th>113964</th>\n <td>WOS:000618210000032</td>\n <td>Karolinska Univ Hosp</td>\n <td>Sweden</td>\n <td>Karolinska Univ Hosp</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>160284</th>\n <td>WOS:000812227000009</td>\n <td>Univ Appl Sci Upper Austria</td>\n <td>Austria</td>\n <td>Univ Appl Sci Upper Austria</td>\n </tr>\n <tr>\n <th>29314</th>\n <td>WOS:000381396400013</td>\n <td>Univ Southampton</td>\n <td>United Kingdom</td>\n <td>Univ Southampton</td>\n </tr>\n <tr>\n <th>17045</th>\n <td>WOS:000347046200017</td>\n <td>Charles Univ Prague</td>\n <td>Czech Republic</td>\n <td>Charles Univ Prague</td>\n </tr>\n <tr>\n <th>164118</th>\n <td>WOS:000832954200001</td>\n <td>Nanjing Univ Aeronaut &amp; Astronaut</td>\n <td>China</td>\n <td>Nanjing Univ Aeronaut &amp; Astronaut</td>\n </tr>\n <tr>\n <th>109992</th>\n <td>WOS:000604257500070</td>\n <td>KTH Royal Inst Technol</td>\n <td>Sweden</td>\n <td>Royal Inst Technol</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": " Country Institution_harm count\n8168 Croatia Inst Adriat Crops & Karst Reclamat 1\n3417 China Ctr Eye & Vis Res 1\n1034 China Westlake Inst Adv Study 13\n13427 Italy Macerata Hosp 1\n8071 China Key Lab Ecoind Green Technol Fujian Prov 1\n... ... ... ...\n17230 United Kingdom Univ Kingston 6\n8847 France Univ Artois 8\n16071 Spain Catalonia Geriatr & Gerontol Soc 1\n6357 China Wuxi Huace Elect Syst Co Ltd 1\n9049 France Excelia Business Sch 3\n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>8168</th>\n <td>Croatia</td>\n <td>Inst Adriat Crops &amp; Karst Reclamat</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3417</th>\n <td>China</td>\n <td>Ctr Eye &amp; Vis Res</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1034</th>\n <td>China</td>\n <td>Westlake Inst Adv Study</td>\n <td>13</td>\n </tr>\n <tr>\n <th>13427</th>\n <td>Italy</td>\n <td>Macerata Hosp</td>\n <td>1</td>\n </tr>\n <tr>\n <th>8071</th>\n <td>China</td>\n <td>Key Lab Ecoind Green Technol Fujian Prov</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>17230</th>\n <td>United Kingdom</td>\n <td>Univ Kingston</td>\n <td>6</td>\n </tr>\n <tr>\n <th>8847</th>\n <td>France</td>\n <td>Univ Artois</td>\n <td>8</td>\n </tr>\n <tr>\n <th>16071</th>\n <td>Spain</td>\n <td>Catalonia Geriatr &amp; Gerontol Soc</td>\n <td>1</td>\n </tr>\n <tr>\n <th>6357</th>\n <td>China</td>\n <td>Wuxi Huace Elect Syst Co Ltd</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9049</th>\n <td>France</td>\n <td>Excelia Business Sch</td>\n <td>3</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n",
"# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n",
"univ_norm.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# from pandarallel import pandarallel\n",
"# pandarallel.initialize(progress_bar=True, nb_workers=2)\n",
"#\n",
"# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"def ngrams(string, n=3):\n",
"\n",
" string = re.sub(r'[,-./]|\\sBD',r'', string)\n",
" ngrams = zip(*[string[i:] for i in range(n)])\n",
" return [''.join(ngram) for ngram in ngrams]\n",
"\n",
"# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.\n",
"# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.\n",
"\n",
"def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
" # force A and B as a CSR matrix.\n",
" # If they have already been CSR, there is no overhead\n",
" A = A.tocsr()\n",
" B = B.tocsr()\n",
" M, _ = A.shape\n",
" _, N = B.shape\n",
"\n",
" idx_dtype = np.int32\n",
"\n",
" nnz_max = M*ntop\n",
"\n",
" indptr = np.zeros(M+1, dtype=idx_dtype)\n",
" indices = np.zeros(nnz_max, dtype=idx_dtype)\n",
" data = np.zeros(nnz_max, dtype=A.dtype)\n",
"\n",
" ct.sparse_dot_topn(\n",
" M, N, np.asarray(A.indptr, dtype=idx_dtype),\n",
" np.asarray(A.indices, dtype=idx_dtype),\n",
" A.data,\n",
" np.asarray(B.indptr, dtype=idx_dtype),\n",
" np.asarray(B.indices, dtype=idx_dtype),\n",
" B.data,\n",
" ntop,\n",
" lower_bound,\n",
" indptr, indices, data)\n",
"\n",
" return csr_matrix((data,indices,indptr),shape=(M,N))\n",
"\n",
"# unpacks the resulting sparse matrix\n",
"\n",
"def get_matches_df(sparse_matrix, name_vector, top=None):\n",
" non_zeros = sparse_matrix.nonzero()\n",
"\n",
" sparserows = non_zeros[0]\n",
" sparsecols = non_zeros[1]\n",
"\n",
" if top:\n",
" nr_matches = top\n",
" else:\n",
" nr_matches = sparsecols.size\n",
"\n",
" left_side = np.empty([nr_matches], dtype=object)\n",
" right_side = np.empty([nr_matches], dtype=object)\n",
" similarity = np.zeros(nr_matches)\n",
"\n",
" for index in range(0, nr_matches):\n",
" left_side[index] = name_vector[sparserows[index]]\n",
" right_side[index] = name_vector[sparsecols[index]]\n",
" similarity[index] = sparse_matrix.data[index]\n",
"\n",
" return pd.DataFrame({'left_side': left_side,\n",
" 'right_side': right_side,\n",
" 'similarity': similarity})\n",
"\n",
"\n",
"def discrepancy_filter(df):\n",
" f_df = df.copy()\n",
" tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n",
" for token in tokenlist:\n",
" f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n",
" (~f_df[\"left_side\"].str.contains(token)))\n",
" |\n",
" ((f_df[\"left_side\"].str.contains(token))&\n",
" (~f_df[\"right_side\"].str.contains(token))))].copy()\n",
" return f_df\n",
"\n",
"\n",
"# Define a function to get the high and low counts for each row\n",
"def get_high_low_counts(row):\n",
" if row['left_count'] > row['right_count']:\n",
" high_count = row['left_count']\n",
" low_count = row['right_count']\n",
" else: #row['left_count'] < row['right_count']:\n",
" high_count = row['right_count']\n",
" low_count = row['left_count']\n",
" # else:\n",
" # if len(row['left_side']) > len(row['right_side']):\n",
" # high_count = len(row['left_side'])\n",
" # low_count = len(row['right_side'])\n",
" # else:\n",
" # high_count = len(row['right_side'])\n",
" # low_count = len(row['left_side'])\n",
" return pd.Series([high_count, low_count])"
],
"metadata": {
"collapsed": false
}
},
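{
"cell_type": "markdown",
"source": [
"A small sanity check of the trigram generator and the TF-IDF / sparse cosine-similarity pipeline on a few made-up institution strings (illustrative names, not project data)."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Illustrative sketch with made-up names, not project data.\n",
"demo_names = pd.Series([\"Univ Pisa\", \"Univ of Pisa\", \"Pisa Univ Hosp\"])\n",
"print(ngrams(\"Univ Pisa\"))  # character trigrams after stripping [,-./]\n",
"\n",
"demo_tfidf = TfidfVectorizer(min_df=1, analyzer=ngrams).fit_transform(demo_names)\n",
"demo_matches = awesome_cossim_top(demo_tfidf, demo_tfidf.transpose(), 3, 0)\n",
"demo_df = get_matches_df(demo_matches, demo_names)\n",
"demo_df = demo_df[demo_df[\"similarity\"] < 0.99999]  # drop exact self-matches\n",
"# discrepancy_filter drops pairs where only one side contains a token such as \"Hosp\"\n",
"print(discrepancy_filter(demo_df))"
],
"metadata": {
"collapsed": false
}
},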
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 31/31 [00:00<00:00, 31.97it/s]\n"
]
}
],
"source": [
"merger = pd.DataFrame()\n",
"\n",
"# for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n",
"for i in tqdm(list(univ_norm[\"Country\"].unique())):\n",
" sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n",
" types = sub_inst['Institution_harm']\n",
" vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
" tf_idf_matrix = vectorizer.fit_transform(types)\n",
" t1 = time.time()\n",
" matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8 if i!=\"China\" else 0.9)\n",
" t = time.time()-t1\n",
"\n",
" # store the matches into new dataframe called matched_df and printing 10 samples\n",
" matches_df = get_matches_df(matches, types)\n",
" matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n",
" matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n",
" matches_df[\"Country\"] = i\n",
" # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n",
" # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n",
" merger = pd.concat([merger,matches_df], ignore_index=True)\n",
"\n",
"for s in [\"left\",\"right\"]:\n",
" merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n",
"\n",
"# Apply the function to create a new column\n",
"merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n",
"\n",
"# Use apply again to create the high_side and low_side columns\n",
"merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n",
"merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], axis=1)\n",
"\n",
"# Drop the high_count and low_count columns if they are not needed\n",
"# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1916\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1916it [01:11, 26.94it/s]\n"
]
}
],
"source": [
"fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n",
"fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n",
"fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n",
"print(len(fuzzymerger))\n",
"univ_harm = univ.copy()\n",
"univ_harm[\"merge_iter\"] = 0\n",
"for i,row in tqdm(fuzzymerger.iterrows()):\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]"
],
"metadata": {
"collapsed": false
}
},
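{
"cell_type": "markdown",
"source": [
"A toy illustration (made-up rows, not project data) of the replacement loop above: within a country, each low-frequency spelling (`low_side`) is rewritten to its high-frequency counterpart (`high_side`), and `merge_iter` counts how many times a row has been rewritten."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Toy illustration with made-up rows, not project data.\n",
"toy = pd.DataFrame({\"Country\": [\"Italy\", \"Italy\"],\n",
"                    \"Institution_harm\": [\"Univ Pisa\", \"Univ of Pisa\"],\n",
"                    \"merge_iter\": [0, 0]})\n",
"toy_rule = {\"Country\": \"Italy\", \"low_side\": \"Univ of Pisa\", \"high_side\": \"Univ Pisa\"}\n",
"mask = (toy[\"Country\"] == toy_rule[\"Country\"]) & (toy[\"Institution_harm\"] == toy_rule[\"low_side\"])\n",
"toy.loc[mask, \"merge_iter\"] += 1\n",
"toy.loc[mask, \"Institution_harm\"] = toy_rule[\"high_side\"]\n",
"print(toy)"
],
"metadata": {
"collapsed": false
}
},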
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# Manual fix: collapse the remaining Sapienza University of Rome variants\n",
"univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\") &\n",
"               (univ_harm[\"Institution\"].str.lower().str.contains(\"sapien\")) &\n",
"               (univ_harm[\"Institution\"].str.lower().str.contains(\"univ\"))), \"Institution_harm\"] = \"Sapienza Univ Rome\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "Institution 19821\nInstitution_harm 16646\ndtype: int64"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm[[\"Institution\",\"Institution_harm\"]].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution Country \n49282 WOS:000428099700011 Univ Sheffield United Kingdom \\\n51975 WOS:000432981300002 Chinese Acad Sci China \n64618 WOS:000459693000011 Babes Bolyai Univ Romania \n163145 WOS:000828102100001 Xidian Univ China \n99690 WOS:000566510600001 Fora Forest Technol Spain \n... ... ... ... \n1567 WOS:000293492500004 Univ Essex United Kingdom \n73076 WOS:000476471800022 Shanghai Univ China \n137096 WOS:000715426400001 Queen Mary Hosp China \n164978 WOS:000836819000003 Manchester Metropolitan Univ United Kingdom \n32973 WOS:000390181300013 Univ Complutense Madrid Spain \n\n Institution_harm merge_iter \n49282 Univ Sheffield 0 \n51975 Chinese Acad Sci 0 \n64618 Babes Bolyai Univ 0 \n163145 Xidian Univ 0 \n99690 Fora Forest Technol 0 \n... ... ... \n1567 Univ Essex 0 \n73076 Shanghai Univ 0 \n137096 Queen Mary Hosp 0 \n164978 Manchester Metropolitan Univ 0 \n32973 Univ Complutense Madrid 0 \n\n[500 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>merge_iter</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>49282</th>\n <td>WOS:000428099700011</td>\n <td>Univ Sheffield</td>\n <td>United Kingdom</td>\n <td>Univ Sheffield</td>\n <td>0</td>\n </tr>\n <tr>\n <th>51975</th>\n <td>WOS:000432981300002</td>\n <td>Chinese Acad Sci</td>\n <td>China</td>\n <td>Chinese Acad Sci</td>\n <td>0</td>\n </tr>\n <tr>\n <th>64618</th>\n <td>WOS:000459693000011</td>\n <td>Babes Bolyai Univ</td>\n <td>Romania</td>\n <td>Babes Bolyai Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>163145</th>\n <td>WOS:000828102100001</td>\n <td>Xidian Univ</td>\n <td>China</td>\n <td>Xidian Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>99690</th>\n <td>WOS:000566510600001</td>\n <td>Fora Forest Technol</td>\n <td>Spain</td>\n <td>Fora Forest Technol</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1567</th>\n <td>WOS:000293492500004</td>\n <td>Univ Essex</td>\n <td>United Kingdom</td>\n <td>Univ Essex</td>\n <td>0</td>\n </tr>\n <tr>\n <th>73076</th>\n <td>WOS:000476471800022</td>\n <td>Shanghai Univ</td>\n <td>China</td>\n <td>Shanghai Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>137096</th>\n <td>WOS:000715426400001</td>\n <td>Queen Mary Hosp</td>\n <td>China</td>\n <td>Queen Mary Hosp</td>\n <td>0</td>\n </tr>\n <tr>\n <th>164978</th>\n <td>WOS:000836819000003</td>\n <td>Manchester Metropolitan Univ</td>\n <td>United Kingdom</td>\n <td>Manchester Metropolitan Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>32973</th>\n <td>WOS:000390181300013</td>\n <td>Univ Complutense Madrid</td>\n <td>Spain</td>\n <td>Univ Complutense Madrid</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>500 rows × 5 columns</p>\n</div>"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm.sample(500)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [
"univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}