ZSI_Reconnect_China/WOS/wos_univ_normalizer.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 191,
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Import the libraries and modules used throughout this notebook\n",
    "\n",
    "import pandas as pd\n",
    "import re\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import numpy as np\n",
    "from scipy.sparse import csr_matrix\n",
    "import sparse_dot_topn.sparse_dot_topn as ct  #Cosine Similarity\n",
    "import time\n",
    "from tqdm import tqdm"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "outputs": [],
   "source": [
    "def wikinorm(univ_string):\n",
    "    \"\"\"Return the web-search result title closest to an institution name.\n",
    "\n",
    "    Searches for ``univ_string`` (English, three advanced results), discards\n",
    "    titles containing \"Category:\" and picks the remaining title with the\n",
    "    smallest edit distance to the part of ``univ_string`` before the first comma.\n",
    "\n",
    "    Args:\n",
    "        univ_string: Search query; the text before the first comma is used as\n",
    "            the institution name when ranking the result titles.\n",
    "\n",
    "    Returns:\n",
    "        The best-matching result title, or ``None`` when the search yields no\n",
    "        usable title.\n",
    "    \"\"\"\n",
    "    # Imports are kept local as in the original -- presumably so the function can be\n",
    "    # shipped to pandarallel worker processes; TODO confirm.\n",
    "    from googlesearch import search\n",
    "    from nltk.metrics import edit_distance\n",
    "    from numpy.random import default_rng\n",
    "\n",
    "    rng = default_rng()\n",
    "    # sleep_interval is a single random draw from [1, 5).\n",
    "    results = search(univ_string, lang=\"en\", num_results=3, advanced=True, sleep_interval=rng.uniform(1, 5))\n",
    "    univ_name = univ_string.split(\",\")[0]\n",
    "    titles = [hit.title for hit in results if \"Category:\" not in hit.title]\n",
    "    if not titles:\n",
    "        # An empty result list used to raise IndexError on ``[0][0]``.\n",
    "        return None\n",
    "    # min() returns the first title with the lowest distance, like the stable sort it replaces.\n",
    "    return min(titles, key=lambda title: edit_distance(univ_name, title))\n"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "outputs": [],
   "source": [
    "def replace_uppercase_words(text):\n",
    "    \"\"\"Strip single-case words from a mixed-case institution name.\n",
    "\n",
    "    Text whose words are all upper-case or all lower-case is returned unchanged.\n",
    "    Otherwise every word that is entirely upper-case (e.g. an acronym) or\n",
    "    entirely lower-case is dropped and the remaining words are re-joined with\n",
    "    single spaces.\n",
    "\n",
    "    Args:\n",
    "        text: Institution name. Non-string values (such as NaN or None) are\n",
    "            returned unchanged instead of raising on ``.split``.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned name, or ``text`` itself when no word would be left.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        return text\n",
    "    words = text.split()\n",
    "    if all(w.isupper() for w in words) or all(w.islower() for w in words):\n",
    "        return text\n",
    "    # split() already strips whitespace, so the words need no further trimming.\n",
    "    kept = [w for w in words if not w.isupper() and not w.islower()]\n",
    "    # Fall back to the input rather than blanking a name such as \"CNRS de\".\n",
    "    return \" \".join(kept) if kept else text"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO: Pandarallel will run on 4 workers.\n",
      "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n",
      "\n",
      "WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
      "https://nalepae.github.io/pandarallel/troubleshooting/\n"
     ]
    },
    {
     "data": {
      "text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=38767), Label(value='0 / 38767')))…",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "8551fdcfc52a43108a78c1e91915c681"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "155067\n"
     ]
    }
   ],
   "source": [
    "# Load the WoS institution/location spreadsheet from the processed-data folder.\n",
    "outdir=\"wos_processed_data\"\n",
    "univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
    "\n",
    "from pandarallel import pandarallel\n",
    "# Initialise pandarallel with 4 workers and a progress bar so parallel_apply is available.\n",
    "pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
    "\n",
    "# Build the harmonised-name column by cleaning every raw institution string in parallel.\n",
    "univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
    "print(len(univ))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "outputs": [
    {
     "data": {
      "text/plain": "         UT (Unique WOS ID)                          Institution   \n1094    WOS:000292330300050              Hong Kong Polytech Univ  \\\n21547   WOS:000374363900001               Guangdong Univ Technol   \n53778   WOS:000459846300019                          Aarhus Univ   \n153776  WOS:000907044000014                           Univ Siena   \n81562   WOS:000554591602038  China Natl Elect Import Export Corp   \n...                     ...                                  ...   \n29206   WOS:000397047200002                  Univ Duisburg Essen   \n21658   WOS:000374617600020                     Univ Southampton   \n43289   WOS:000434742800004                     Univ Strathclyde   \n37200   WOS:000418525100013                Goethe Univ Frankfurt   \n95964   WOS:000616310200013               Eindhoven Univ Technol   \n\n               Country                     Institution_harm  \n1094             China              Hong Kong Polytech Univ  \n21547            China               Guangdong Univ Technol  \n53778          Denmark                          Aarhus Univ  \n153776           Italy                           Univ Siena  \n81562            China  China Natl Elect Import Export Corp  \n...                ...                                  ...  \n29206          Germany                  Univ Duisburg Essen  \n21658   United Kingdom                     Univ Southampton  \n43289   United Kingdom                     Univ Strathclyde  \n37200          Germany                Goethe Univ Frankfurt  \n95964      Netherlands               Eindhoven Univ Technol  \n\n[100 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Institution</th>\n      <th>Country</th>\n      <th>Institution_harm</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1094</th>\n      <td>WOS:000292330300050</td>\n      <td>Hong Kong Polytech Univ</td>\n      <td>China</td>\n      <td>Hong Kong Polytech Univ</td>\n    </tr>\n    <tr>\n      <th>21547</th>\n      <td>WOS:000374363900001</td>\n      <td>Guangdong Univ Technol</td>\n      <td>China</td>\n      <td>Guangdong Univ Technol</td>\n    </tr>\n    <tr>\n      <th>53778</th>\n      <td>WOS:000459846300019</td>\n      <td>Aarhus Univ</td>\n      <td>Denmark</td>\n      <td>Aarhus Univ</td>\n    </tr>\n    <tr>\n      <th>153776</th>\n      <td>WOS:000907044000014</td>\n      <td>Univ Siena</td>\n      <td>Italy</td>\n      <td>Univ Siena</td>\n    </tr>\n    <tr>\n      <th>81562</th>\n      <td>WOS:000554591602038</td>\n      <td>China Natl Elect Import Export Corp</td>\n      <td>China</td>\n      <td>China Natl Elect Import Export Corp</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>29206</th>\n      <td>WOS:000397047200002</td>\n      <td>Univ Duisburg Essen</td>\n      <td>Germany</td>\n      <td>Univ Duisburg Essen</td>\n    </tr>\n    <tr>\n      <th>21658</th>\n      <td>WOS:000374617600020</td>\n      <td>Univ Southampton</td>\n      <td>United Kingdom</td>\n      <td>Univ Southampton</td>\n    </tr>\n    <tr>\n      <th>43289</th>\n      <td>WOS:000434742800004</td>\n      <td>Univ Strathclyde</td>\n      
<td>United Kingdom</td>\n      <td>Univ Strathclyde</td>\n    </tr>\n    <tr>\n      <th>37200</th>\n      <td>WOS:000418525100013</td>\n      <td>Goethe Univ Frankfurt</td>\n      <td>Germany</td>\n      <td>Goethe Univ Frankfurt</td>\n    </tr>\n    <tr>\n      <th>95964</th>\n      <td>WOS:000616310200013</td>\n      <td>Eindhoven Univ Technol</td>\n      <td>Netherlands</td>\n      <td>Eindhoven Univ Technol</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
     },
     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect a random sample of 100 rows (no random_state is set, so the sample differs between runs).\n",
    "univ.sample(100)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "outputs": [
    {
     "data": {
      "text/plain": "        Country                  Institution_harm  count\n12655    Poland                     Space Res Ctr      6\n12940  Portugal     Ctr Invest Energia State Grid      1\n616       China              Minist Nat Resources     78\n5561      China  PowerChina Huadong Engn Corp Ltd      1\n514       China                    Chongqing Univ    478\n...         ...                               ...    ...\n476    Bulgaria                         Tech Univ      1\n12454    Norway               Stavanger Univ Hosp      9\n5489      China               Shanghai Sports Sch      1\n768       China                        Hubei Univ     25\n13527     Spain            Jimenez Diaz Univ Hosp      2\n\n[100 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Country</th>\n      <th>Institution_harm</th>\n      <th>count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>12655</th>\n      <td>Poland</td>\n      <td>Space Res Ctr</td>\n      <td>6</td>\n    </tr>\n    <tr>\n      <th>12940</th>\n      <td>Portugal</td>\n      <td>Ctr Invest Energia State Grid</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>616</th>\n      <td>China</td>\n      <td>Minist Nat Resources</td>\n      <td>78</td>\n    </tr>\n    <tr>\n      <th>5561</th>\n      <td>China</td>\n      <td>PowerChina Huadong Engn Corp Ltd</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>514</th>\n      <td>China</td>\n      <td>Chongqing Univ</td>\n      <td>478</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>476</th>\n      <td>Bulgaria</td>\n      <td>Tech Univ</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>12454</th>\n      <td>Norway</td>\n      <td>Stavanger Univ Hosp</td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <th>5489</th>\n      <td>China</td>\n      <td>Shanghai Sports Sch</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>768</th>\n      <td>China</td>\n      <td>Hubei Univ</td>\n      <td>25</td>\n    </tr>\n    <tr>\n      <th>13527</th>\n      <td>Spain</td>\n      <td>Jimenez Diaz Univ Hosp</td>\n      <td>2</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 196,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the rows for each harmonised institution name within each country.\n",
    "univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n",
    "# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n",
    "univ_norm.sample(100)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "outputs": [],
   "source": [
    "# from pandarallel import pandarallel\n",
    "# pandarallel.initialize(progress_bar=True, nb_workers=2)\n",
    "#\n",
    "# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "outputs": [],
   "source": [
    "def ngrams(string, n=3):\n",
    "    \"\"\"Split a name into overlapping character n-grams.\n",
    "\n",
    "    Commas, hyphens, dots, slashes and any whitespace-prefixed \"BD\" are removed\n",
    "    first. A cleaned string shorter than ``n`` yields an empty list.\n",
    "    \"\"\"\n",
    "    # In the character class, ``,-.`` is a range that covers exactly ',', '-' and '.'.\n",
    "    cleaned = re.sub(r'[,-./]|\\sBD',r'', string)\n",
    "    # zip() over n progressively shifted copies stops at the shortest one.\n",
    "    shifted = (cleaned[offset:] for offset in range(n))\n",
    "    return list(map(''.join, zip(*shifted)))\n",
    "\n",
    "# Cosine similarity is the usual measure for comparing two vectors of TF-IDF values.\n",
    "# The resulting matrix is very sparse; scikit-learn handles this by returning a sparse CSR matrix.\n",
    "\n",
    "def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
    "    \"\"\"Multiply sparse ``A`` by sparse ``B`` via ``sparse_dot_topn``.\n",
    "\n",
    "    ``ntop`` sizes the output buffers (rows of ``A`` times ``ntop``) and is passed\n",
    "    on together with ``lower_bound`` -- presumably the per-row result cap and the\n",
    "    minimum score kept; confirm against the sparse_dot_topn documentation.\n",
    "\n",
    "    Returns:\n",
    "        A CSR matrix of shape (rows of ``A``, columns of ``B``).\n",
    "    \"\"\"\n",
    "    # Force CSR layout on both operands (per the original note, free if already CSR).\n",
    "    left = A.tocsr()\n",
    "    right = B.tocsr()\n",
    "    n_rows, _ = left.shape\n",
    "    _, n_cols = right.shape\n",
    "\n",
    "    index_type = np.int32\n",
    "    capacity = n_rows*ntop\n",
    "\n",
    "    # Output buffers handed to the compiled routine, which fills them in place.\n",
    "    out_indptr = np.zeros(n_rows+1, dtype=index_type)\n",
    "    out_indices = np.zeros(capacity, dtype=index_type)\n",
    "    out_data = np.zeros(capacity, dtype=left.dtype)\n",
    "\n",
    "    def as_index(values):\n",
    "        # The routine is given int32 index arrays.\n",
    "        return np.asarray(values, dtype=index_type)\n",
    "\n",
    "    ct.sparse_dot_topn(\n",
    "        n_rows, n_cols,\n",
    "        as_index(left.indptr), as_index(left.indices), left.data,\n",
    "        as_index(right.indptr), as_index(right.indices), right.data,\n",
    "        ntop, lower_bound,\n",
    "        out_indptr, out_indices, out_data)\n",
    "\n",
    "    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))\n",
    "\n",
    "# unpacks the resulting sparse matrix\n",
    "\n",
    "def get_matches_df(sparse_matrix, name_vector, top=None):\n",
    "    \"\"\"Unpack a sparse similarity matrix into a long table of matched names.\n",
    "\n",
    "    Args:\n",
    "        sparse_matrix: SciPy sparse matrix whose entry (i, j) is the similarity\n",
    "            between the ``name_vector`` items at positions i and j.\n",
    "        name_vector: Names (list, array or Series) in matrix row/column order;\n",
    "            they are looked up by position, not by index label.\n",
    "        top: Optional cap on the number of matches returned; a falsy value\n",
    "            means all matches. Values above the number of matches are clamped.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns ``left_side``, ``right_side`` and ``similarity``.\n",
    "    \"\"\"\n",
    "    # Read rows, columns and values from one COO view so the three stay aligned.\n",
    "    coo = sparse_matrix.tocoo()\n",
    "    stored = coo.data != 0  # like nonzero(): skip explicitly stored zeros\n",
    "    rows, cols, values = coo.row[stored], coo.col[stored], coo.data[stored]\n",
    "\n",
    "    # Clamp so an over-large ``top`` no longer indexes past the last match.\n",
    "    nr_matches = min(top, values.size) if top else values.size\n",
    "\n",
    "    names = np.asarray(name_vector, dtype=object)\n",
    "    return pd.DataFrame({'left_side': names[rows[:nr_matches]],\n",
    "                         'right_side': names[cols[:nr_matches]],\n",
    "                         'similarity': values[:nr_matches].astype(float)})\n",
    "\n",
    "\n",
    "def discrepancy_filter(df):\n",
    "    \"\"\"Drop matches in which only one side carries an institution-type token.\n",
    "\n",
    "    A row is removed when exactly one of ``left_side`` / ``right_side`` contains\n",
    "    any of the tokens below. The input frame is not modified; a filtered copy is\n",
    "    returned.\n",
    "    \"\"\"\n",
    "    kept = df.copy()\n",
    "    for marker in [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]:\n",
    "        in_right = kept[\"right_side\"].str.contains(marker)\n",
    "        in_left = kept[\"left_side\"].str.contains(marker)\n",
    "        # True where the marker appears on one side but not on the other.\n",
    "        one_sided = (in_right & ~in_left) | (in_left & ~in_right)\n",
    "        kept = kept[~one_sided].copy()\n",
    "    return kept\n",
    "\n",
    "\n",
    "# Define a function to get the high and low counts for each row\n",
    "def get_high_low_counts(row):\n",
    "    \"\"\"Return ``pd.Series([high, low])`` for a row's two match counts.\n",
    "\n",
    "    ``row`` must provide ``left_count`` and ``right_count``. When the left count\n",
    "    is not strictly greater, the right count is reported as the high one.\n",
    "    \"\"\"\n",
    "    left, right = row['left_count'], row['right_count']\n",
    "    ordered = [left, right] if left > right else [right, left]\n",
    "    return pd.Series(ordered)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 31/31 [00:00<00:00, 32.89it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build `merger`: one row per fuzzy-match candidate pair, found country by country.\n",
    "# Frames are collected in a list and concatenated once; calling pd.concat inside\n",
    "# the loop re-copies the growing frame on every pass.\n",
    "country_matches = []\n",
    "\n",
    "for i in tqdm(list(univ_norm[\"Country\"].unique())):\n",
    "    sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n",
    "    types = sub_inst['Institution_harm']\n",
    "    # Character n-gram TF-IDF vectors for this country's institution names.\n",
    "    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
    "    tf_idf_matrix = vectorizer.fit_transform(types)\n",
    "    # ntop=10; the lower bound is 0.9 for China and 0.8 for every other country.\n",
    "    matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8 if i!=\"China\" else 0.9)\n",
    "\n",
    "    matches_df = get_matches_df(matches, types)\n",
    "    matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n",
    "    matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n",
    "    matches_df[\"Country\"] = i\n",
    "    country_matches.append(matches_df)\n",
    "\n",
    "merger = pd.concat(country_matches, ignore_index=True)\n",
    "\n",
    "# Look up how often each name occurs *within its own country*. The previous version\n",
    "# scanned all of `univ` once per row and ignored Country, so a name shared by several\n",
    "# countries got a pooled count even though matching and merging are per country.\n",
    "pair_counts = univ_norm.set_index([\"Country\", \"Institution_harm\"])[\"count\"]\n",
    "for s in [\"left\",\"right\"]:\n",
    "    keys = pd.MultiIndex.from_frame(merger[[\"Country\", f\"{s}_side\"]])\n",
    "    merger[f\"{s}_count\"] = pair_counts.reindex(keys).to_numpy()\n",
    "\n",
    "# The more frequent spelling is the merge target (high side). On a tie the right side is\n",
    "# treated as high, as before.\n",
    "# NOTE(review): matches are symmetric, so a tied pair yields both A->B and B->A -- confirm\n",
    "# the later merge step handles that as intended.\n",
    "left_is_high = merger[\"left_count\"] > merger[\"right_count\"]\n",
    "merger[\"high_count\"] = np.where(left_is_high, merger[\"left_count\"], merger[\"right_count\"])\n",
    "merger[\"low_count\"] = np.where(left_is_high, merger[\"right_count\"], merger[\"left_count\"])\n",
    "merger[\"high_side\"] = np.where(left_is_high, merger[\"left_side\"], merger[\"right_side\"])\n",
    "merger[\"low_side\"] = np.where(left_is_high, merger[\"right_side\"], merger[\"left_side\"])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1538\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1538it [01:04, 23.79it/s]\n"
     ]
    }
   ],
   "source": [
    "# Reduce the candidate pairs to one merge target per (Country, low_side): after sorting,\n",
    "# drop_duplicates keeps the candidate with the largest high_count.\n",
    "fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n",
    "fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n",
    "# Final order is ascending high_count -- presumably so merges into small groups are applied\n",
    "# before merges into large ones; TODO confirm.\n",
    "fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n",
    "print(len(fuzzymerger))\n",
    "univ_harm = univ.copy()\n",
    "# merge_iter counts how many renames have been applied to each row.\n",
    "univ_harm[\"merge_iter\"] = 0\n",
    "# Renames are applied one after another, so a later rule sees names written by earlier ones.\n",
    "for rule in tqdm(fuzzymerger.itertuples(index=False), total=len(fuzzymerger)):\n",
    "    # Build the row mask once per rule; it was previously computed twice per iteration.\n",
    "    to_rename = ((univ_harm[\"Country\"]==rule.Country)&\n",
    "                 (univ_harm[\"Institution_harm\"]==rule.low_side))\n",
    "    univ_harm.loc[to_rename,\"merge_iter\"] += 1\n",
    "    univ_harm.loc[to_rename,\"Institution_harm\"] = rule.high_side"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "outputs": [
    {
     "data": {
      "text/plain": "     Country                                         low_side   \n0      China  Logist Univ Chinese Peoples Armed Police Forces  \\\n9      China                  Flight Automat Control Res Inst   \n10     China         Northwest Elect Power Design Inst Co Ltd   \n11     China   Northwest Elect Power Design Inst Co Ltd China   \n12     China           Northwest Inst Ecoenvironm & Resources   \n...      ...                                              ...   \n1531   China                      Chinese Univ Hong Kong Hong   \n1532   China                       Huazhong Univ Sci & Techno   \n1533   China                Hong Kong Polytech Univ Hong Kong   \n1534   China                          Kong Kong Polytech Univ   \n1537   China                    Univ Elect Sci & Technol Chin   \n\n                                              high_side  high_count   \n0        Logist Univ Chinese Peoples Armed Police Force           1  \\\n9                  Xian Flight Automat Control Res Inst           1   \n10       Northwest Elect Power Design Inst Co Ltd China           1   \n11             Northwest Elect Power Design Inst Co Ltd           1   \n12    Northwest Inst Ecoenvironm & Resources Chinese Ac           1   \n...                                                 ...         ...   \n1531                             Chinese Univ Hong Kong         728   \n1532                        Huazhong Univ Sci & Technol         729   \n1533                            Hong Kong Polytech Univ         809   \n1534                            Hong Kong Polytech Univ         809   \n1537                     Univ Elect Sci & Technol China        1076   \n\n      low_count  similarity  \n0             1    0.988072  \n9             1    0.905747  \n10            1    0.926984  \n11            1    0.926984  \n12            1    0.910630  \n...         ...         ...  
\n1531          1    0.935944  \n1532          1    0.989260  \n1533          1    0.917345  \n1534          1    0.939416  \n1537          1    0.983258  \n\n[346 rows x 6 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Country</th>\n      <th>low_side</th>\n      <th>high_side</th>\n      <th>high_count</th>\n      <th>low_count</th>\n      <th>similarity</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>China</td>\n      <td>Logist Univ Chinese Peoples Armed Police Forces</td>\n      <td>Logist Univ Chinese Peoples Armed Police Force</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0.988072</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>China</td>\n      <td>Flight Automat Control Res Inst</td>\n      <td>Xian Flight Automat Control Res Inst</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0.905747</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>China</td>\n      <td>Northwest Elect Power Design Inst Co Ltd</td>\n      <td>Northwest Elect Power Design Inst Co Ltd China</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0.926984</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>China</td>\n      <td>Northwest Elect Power Design Inst Co Ltd China</td>\n      <td>Northwest Elect Power Design Inst Co Ltd</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0.926984</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>China</td>\n      <td>Northwest Inst Ecoenvironm &amp; Resources</td>\n      <td>Northwest Inst Ecoenvironm &amp; Resources Chinese Ac</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0.910630</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1531</th>\n      <td>China</td>\n      
<td>Chinese Univ Hong Kong Hong</td>\n      <td>Chinese Univ Hong Kong</td>\n      <td>728</td>\n      <td>1</td>\n      <td>0.935944</td>\n    </tr>\n    <tr>\n      <th>1532</th>\n      <td>China</td>\n      <td>Huazhong Univ Sci &amp; Techno</td>\n      <td>Huazhong Univ Sci &amp; Technol</td>\n      <td>729</td>\n      <td>1</td>\n      <td>0.989260</td>\n    </tr>\n    <tr>\n      <th>1533</th>\n      <td>China</td>\n      <td>Hong Kong Polytech Univ Hong Kong</td>\n      <td>Hong Kong Polytech Univ</td>\n      <td>809</td>\n      <td>1</td>\n      <td>0.917345</td>\n    </tr>\n    <tr>\n      <th>1534</th>\n      <td>China</td>\n      <td>Kong Kong Polytech Univ</td>\n      <td>Hong Kong Polytech Univ</td>\n      <td>809</td>\n      <td>1</td>\n      <td>0.939416</td>\n    </tr>\n    <tr>\n      <th>1537</th>\n      <td>China</td>\n      <td>Univ Elect Sci &amp; Technol Chin</td>\n      <td>Univ Elect Sci &amp; Technol China</td>\n      <td>1076</td>\n      <td>1</td>\n      <td>0.983258</td>\n    </tr>\n  </tbody>\n</table>\n<p>346 rows × 6 columns</p>\n</div>"
     },
     "execution_count": 201,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "outputs": [
    {
     "data": {
      "text/plain": "         UT (Unique WOS ID)                           Institution Country   \n244     WOS:000286472300003                            Univ Trent   Italy  \\\n364     WOS:000287586100011                            Univ Trent   Italy   \n410     WOS:000287939200011      Abdus Salam Int Ctr Theoret Phys   Italy   \n765     WOS:000290996200002                            Univ Trent   Italy   \n907     WOS:000291698400013                       INFN Sez Roma 1   Italy   \n...                     ...                                   ...     ...   \n153063  WOS:000900129900175        Univ Rome Campus Biomed Aquila   Italy   \n154775  WOS:000929737300001                    Prevent & Res Inst   Italy   \n154813  WOS:000929737300001                       Ist Super Sanit   Italy   \n154855  WOS:000933331200004                       Univ Federio II   Italy   \n154857  WOS:000933331200004  INAF Osservatorio Astron Capodimonte   Italy   \n\n                         Institution_harm  merge_iter  \n244                           Univ Trento           1  \n364                           Univ Trento           1  \n410     Abdus Salaam Int Ctr Theoret Phys           1  \n765                           Univ Trento           1  \n907                              Sez Roma           1  \n...                                   ...         ...  \n153063     Univ Rome Campus Biomed Aquila           2  \n154775                 Prevent & Res Inst           2  \n154813                   Ist Super Sanita           1  \n154855                      Univ Federico           1  \n154857          Osserv Astron Capodimonte           1  \n\n[375 rows x 5 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Institution</th>\n      <th>Country</th>\n      <th>Institution_harm</th>\n      <th>merge_iter</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>244</th>\n      <td>WOS:000286472300003</td>\n      <td>Univ Trent</td>\n      <td>Italy</td>\n      <td>Univ Trento</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>364</th>\n      <td>WOS:000287586100011</td>\n      <td>Univ Trent</td>\n      <td>Italy</td>\n      <td>Univ Trento</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>410</th>\n      <td>WOS:000287939200011</td>\n      <td>Abdus Salam Int Ctr Theoret Phys</td>\n      <td>Italy</td>\n      <td>Abdus Salaam Int Ctr Theoret Phys</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>765</th>\n      <td>WOS:000290996200002</td>\n      <td>Univ Trent</td>\n      <td>Italy</td>\n      <td>Univ Trento</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>907</th>\n      <td>WOS:000291698400013</td>\n      <td>INFN Sez Roma 1</td>\n      <td>Italy</td>\n      <td>Sez Roma</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>153063</th>\n      <td>WOS:000900129900175</td>\n      <td>Univ Rome Campus Biomed Aquila</td>\n      <td>Italy</td>\n      <td>Univ Rome Campus Biomed Aquila</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>154775</th>\n      <td>WOS:000929737300001</td>\n      <td>Prevent &amp; Res Inst</td>\n      <td>Italy</td>\n      <td>Prevent &amp; Res Inst</td>\n      
<td>2</td>\n    </tr>\n    <tr>\n      <th>154813</th>\n      <td>WOS:000929737300001</td>\n      <td>Ist Super Sanit</td>\n      <td>Italy</td>\n      <td>Ist Super Sanita</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>154855</th>\n      <td>WOS:000933331200004</td>\n      <td>Univ Federio II</td>\n      <td>Italy</td>\n      <td>Univ Federico</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>154857</th>\n      <td>WOS:000933331200004</td>\n      <td>INAF Osservatorio Astron Capodimonte</td>\n      <td>Italy</td>\n      <td>Osserv Astron Capodimonte</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n<p>375 rows × 5 columns</p>\n</div>"
     },
     "execution_count": 202,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "outputs": [],
   "source": [
    "# Manual override: Italian rows already renamed by the fuzzy merge (merge_iter > 0) whose raw\n",
    "# Institution contains both \"sapien\" and \"univ\" (case-insensitive) get one canonical name.\n",
    "# NOTE(review): matching rows with merge_iter == 0 are not touched -- confirm that is intended.\n",
    "univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\")&\n",
    "           (univ_harm[\"Institution\"].str.lower().str.contains(\"sapien\"))&\n",
    "            (univ_harm[\"Institution\"].str.lower().str.contains(\"univ\"))), \"Institution_harm\"] = \"Sapienza Univ Rome\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "outputs": [
    {
     "data": {
      "text/plain": "Institution         17083\nInstitution_harm    14449\ndtype: int64"
     },
     "execution_count": 209,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compare the number of distinct raw names with the number of distinct harmonised names.\n",
    "univ_harm[[\"Institution\",\"Institution_harm\"]].nunique()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "outputs": [],
   "source": [
    "# Write the harmonised table to the processed-data folder, without the index column.\n",
    "univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}