ZSI_Reconnect_China/WOS/wos_univ_normalizer.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"# Importing libraries and module and some setting for notebook\n",
"\n",
"import pandas as pd\n",
"import re\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"from scipy.sparse import csr_matrix\n",
"import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n",
"import time\n",
"from tqdm import tqdm"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"def wikinorm(univ_string):\n",
" from googlesearch import search\n",
" from nltk.metrics import edit_distance\n",
" from operator import itemgetter\n",
" from numpy.random import default_rng\n",
" rng = default_rng()\n",
" results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n",
" univ_name = univ_string.split(\",\")[0]\n",
" u_results = [i.title for i in results if \"Category:\" not in i.title]\n",
" return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n"
],
"metadata": {
"collapsed": false
}
},
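{
"cell_type": "markdown",
"source": [
"Illustrative sketch (not part of the original run): `wikinorm` ranks Google result titles by edit distance to the institution name, i.e. the part of the affiliation string before the first comma. The cell below reproduces only that ranking step on hypothetical candidate titles, so it runs without issuing live search queries.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from nltk.metrics import edit_distance\n",
"from operator import itemgetter\n",
"\n",
"# Hypothetical candidate page titles, standing in for live Google results\n",
"candidates = [\"University of Copenhagen - Wikipedia\", \"Copenhagen Business School - Wikipedia\"]\n",
"univ_name = \"Univ Copenhagen, Denmark, wikipedia\".split(\",\")[0]\n",
"# Keep the title with the smallest edit distance to the institution name\n",
"sorted([(title, edit_distance(univ_name, title)) for title in candidates], key=itemgetter(1))[0][0]"
],
"metadata": {
"collapsed": false
}
},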
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"def replace_uppercase_words(text):\n",
" words = text.split()\n",
" all_uppercase = all(word.isupper() for word in words)\n",
" all_lowercase = all(word.islower() for word in words)\n",
" if all_uppercase or all_lowercase:\n",
" return text\n",
" else:\n",
" result = []\n",
" for word in words:\n",
" w = word.strip()\n",
" if not w.isupper() and not w.islower():\n",
" result.append(w)\n",
" return \" \".join(result).strip()"
],
"metadata": {
"collapsed": false
}
},
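{
"cell_type": "markdown",
"source": [
"Quick sanity check (illustrative inputs, not drawn from the WOS data): `replace_uppercase_words` keeps mixed-case tokens, drops all-caps and all-lowercase tokens, and returns uniformly cased strings unchanged.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Hypothetical affiliation fragments used only to illustrate the behaviour\n",
"print(replace_uppercase_words(\"UNIV Tsinghua Univ BEIJING\"))  # mixed-case tokens kept -> 'Tsinghua Univ'\n",
"print(replace_uppercase_words(\"TSINGHUA UNIV\"))  # uniformly upper-case -> returned unchanged"
],
"metadata": {
"collapsed": false
}
},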
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: Pandarallel will run on 4 workers.\n",
"INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n",
"\n",
"WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
"https://nalepae.github.io/pandarallel/troubleshooting/\n"
]
},
{
"data": {
"text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "5f8bead5565146a5843c01b81b77cf9f"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"178638\n"
]
}
],
"source": [
"outdir=\"wos_processed_data\"\n",
"univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
"\n",
"from pandarallel import pandarallel\n",
"pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
"\n",
"univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
"print(len(univ))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution \n149037 WOS:000764953300001 Univ Elect Sci & Technol China \\\n86834 WOS:000519526500027 Radboud Univ Nijmegen \n143915 WOS:000739917304088 Swiss Fed Inst Technol \n135117 WOS:000707680800001 North China Elect Power Univ Beijing \n110390 WOS:000605608700001 Imperial Coll London \n... ... ... \n21250 WOS:000358912300001 Jilin Univ \n23018 WOS:000364230600002 Tampere Univ Technol \n126847 WOS:000675855300001 Univ Copenhagen \n15313 WOS:000343701400001 Univ Siena \n77834 WOS:000490147400012 Tsinghua Univ \n\n Country Institution_harm \n149037 China Univ Elect Sci & Technol China \n86834 Netherlands Radboud Univ Nijmegen \n143915 Switzerland Swiss Fed Inst Technol \n135117 China North China Elect Power Univ Beijing \n110390 United Kingdom Imperial Coll London \n... ... ... \n21250 China Jilin Univ \n23018 Finland Tampere Univ Technol \n126847 Denmark Univ Copenhagen \n15313 Italy Univ Siena \n77834 China Tsinghua Univ \n\n[100 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>149037</th>\n <td>WOS:000764953300001</td>\n <td>Univ Elect Sci &amp; Technol China</td>\n <td>China</td>\n <td>Univ Elect Sci &amp; Technol China</td>\n </tr>\n <tr>\n <th>86834</th>\n <td>WOS:000519526500027</td>\n <td>Radboud Univ Nijmegen</td>\n <td>Netherlands</td>\n <td>Radboud Univ Nijmegen</td>\n </tr>\n <tr>\n <th>143915</th>\n <td>WOS:000739917304088</td>\n <td>Swiss Fed Inst Technol</td>\n <td>Switzerland</td>\n <td>Swiss Fed Inst Technol</td>\n </tr>\n <tr>\n <th>135117</th>\n <td>WOS:000707680800001</td>\n <td>North China Elect Power Univ Beijing</td>\n <td>China</td>\n <td>North China Elect Power Univ Beijing</td>\n </tr>\n <tr>\n <th>110390</th>\n <td>WOS:000605608700001</td>\n <td>Imperial Coll London</td>\n <td>United Kingdom</td>\n <td>Imperial Coll London</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>21250</th>\n <td>WOS:000358912300001</td>\n <td>Jilin Univ</td>\n <td>China</td>\n <td>Jilin Univ</td>\n </tr>\n <tr>\n <th>23018</th>\n <td>WOS:000364230600002</td>\n <td>Tampere Univ Technol</td>\n <td>Finland</td>\n <td>Tampere Univ Technol</td>\n </tr>\n <tr>\n <th>126847</th>\n <td>WOS:000675855300001</td>\n <td>Univ Copenhagen</td>\n <td>Denmark</td>\n <td>Univ Copenhagen</td>\n </tr>\n <tr>\n <th>15313</th>\n <td>WOS:000343701400001</td>\n <td>Univ Siena</td>\n <td>Italy</td>\n <td>Univ Siena</td>\n </tr>\n <tr>\n <th>77834</th>\n <td>WOS:000490147400012</td>\n <td>Tsinghua Univ</td>\n <td>China</td>\n <td>Tsinghua Univ</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": " Country Institution_harm \n496 Belgium Haute Ecole Louvain Hainaut \\\n10566 Germany IQM \n6670 China Tiantan Hosp \n16974 Switzerland Species Survival Commiss Mushroom Bracket & \n9200 France Hop Hotel Dieu Paris \n... ... ... \n11326 Germany Int Max Planck Res Sch Earth Syst Modeling \n2874 China China Natl Nucl Corp 416 Hosp \n17967 United Kingdom AccelerComm Ltd \n14041 Netherlands Vankeulen Advies \n4148 China First Peoples Hosp Jingmen Affiliated Hubei Minzu \n\n count \n496 1 \n10566 4 \n6670 1 \n16974 1 \n9200 2 \n... ... \n11326 1 \n2874 2 \n17967 1 \n14041 1 \n4148 1 \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>496</th>\n <td>Belgium</td>\n <td>Haute Ecole Louvain Hainaut</td>\n <td>1</td>\n </tr>\n <tr>\n <th>10566</th>\n <td>Germany</td>\n <td>IQM</td>\n <td>4</td>\n </tr>\n <tr>\n <th>6670</th>\n <td>China</td>\n <td>Tiantan Hosp</td>\n <td>1</td>\n </tr>\n <tr>\n <th>16974</th>\n <td>Switzerland</td>\n <td>Species Survival Commiss Mushroom Bracket &amp;</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9200</th>\n <td>France</td>\n <td>Hop Hotel Dieu Paris</td>\n <td>2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>11326</th>\n <td>Germany</td>\n <td>Int Max Planck Res Sch Earth Syst Modeling</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2874</th>\n <td>China</td>\n <td>China Natl Nucl Corp 416 Hosp</td>\n <td>2</td>\n </tr>\n <tr>\n <th>17967</th>\n <td>United Kingdom</td>\n <td>AccelerComm Ltd</td>\n <td>1</td>\n </tr>\n <tr>\n <th>14041</th>\n <td>Netherlands</td>\n <td>Vankeulen Advies</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4148</th>\n <td>China</td>\n <td>First Peoples Hosp Jingmen Affiliated Hubei Minzu</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n",
"# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n",
"univ_norm.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# from pandarallel import pandarallel\n",
"# pandarallel.initialize(progress_bar=True, nb_workers=2)\n",
"#\n",
"# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"def ngrams(string, n=3):\n",
"\n",
" string = re.sub(r'[,-./]|\\sBD',r'', string)\n",
" ngrams = zip(*[string[i:] for i in range(n)])\n",
" return [''.join(ngram) for ngram in ngrams]\n",
"\n",
"# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.\n",
"# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.\n",
"\n",
"def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
" # force A and B as a CSR matrix.\n",
" # If they have already been CSR, there is no overhead\n",
" A = A.tocsr()\n",
" B = B.tocsr()\n",
" M, _ = A.shape\n",
" _, N = B.shape\n",
"\n",
" idx_dtype = np.int32\n",
"\n",
" nnz_max = M*ntop\n",
"\n",
" indptr = np.zeros(M+1, dtype=idx_dtype)\n",
" indices = np.zeros(nnz_max, dtype=idx_dtype)\n",
" data = np.zeros(nnz_max, dtype=A.dtype)\n",
"\n",
" ct.sparse_dot_topn(\n",
" M, N, np.asarray(A.indptr, dtype=idx_dtype),\n",
" np.asarray(A.indices, dtype=idx_dtype),\n",
" A.data,\n",
" np.asarray(B.indptr, dtype=idx_dtype),\n",
" np.asarray(B.indices, dtype=idx_dtype),\n",
" B.data,\n",
" ntop,\n",
" lower_bound,\n",
" indptr, indices, data)\n",
"\n",
" return csr_matrix((data,indices,indptr),shape=(M,N))\n",
"\n",
"# unpacks the resulting sparse matrix\n",
"\n",
"def get_matches_df(sparse_matrix, name_vector, top=None):\n",
" non_zeros = sparse_matrix.nonzero()\n",
"\n",
" sparserows = non_zeros[0]\n",
" sparsecols = non_zeros[1]\n",
"\n",
" if top:\n",
" nr_matches = top\n",
" else:\n",
" nr_matches = sparsecols.size\n",
"\n",
" left_side = np.empty([nr_matches], dtype=object)\n",
" right_side = np.empty([nr_matches], dtype=object)\n",
" similarity = np.zeros(nr_matches)\n",
"\n",
" for index in range(0, nr_matches):\n",
" left_side[index] = name_vector[sparserows[index]]\n",
" right_side[index] = name_vector[sparsecols[index]]\n",
" similarity[index] = sparse_matrix.data[index]\n",
"\n",
" return pd.DataFrame({'left_side': left_side,\n",
" 'right_side': right_side,\n",
" 'similarity': similarity})\n",
"\n",
"\n",
"def discrepancy_filter(df):\n",
" f_df = df.copy()\n",
" tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n",
" for token in tokenlist:\n",
" f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n",
" (~f_df[\"left_side\"].str.contains(token)))\n",
" |\n",
" ((f_df[\"left_side\"].str.contains(token))&\n",
" (~f_df[\"right_side\"].str.contains(token))))].copy()\n",
" return f_df\n",
"\n",
"\n",
"# Define a function to get the high and low counts for each row\n",
"def get_high_low_counts(row):\n",
" if row['left_count'] > row['right_count']:\n",
" high_count = row['left_count']\n",
" low_count = row['right_count']\n",
" else: #row['left_count'] < row['right_count']:\n",
" high_count = row['right_count']\n",
" low_count = row['left_count']\n",
" # else:\n",
" # if len(row['left_side']) > len(row['right_side']):\n",
" # high_count = len(row['left_side'])\n",
" # low_count = len(row['right_side'])\n",
" # else:\n",
" # high_count = len(row['right_side'])\n",
" # low_count = len(row['left_side'])\n",
" return pd.Series([high_count, low_count])"
],
"metadata": {
"collapsed": false
}
},
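{
"cell_type": "markdown",
"source": [
"Illustrative sketch on a toy name list (hypothetical values, not part of the WOS data): build the character n-gram TF-IDF matrix, compute the top cosine-similarity pairs with `awesome_cossim_top`, and unpack them with `get_matches_df`.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Toy institution names (hypothetical, for demonstration only)\n",
"toy_names = pd.Series([\"Univ Copenhagen\", \"Univ of Copenhagen\", \"Copenhagen Business Sch\"])\n",
"toy_tfidf = TfidfVectorizer(min_df=1, analyzer=ngrams).fit_transform(toy_names)\n",
"# Keep up to 3 matches per name with cosine similarity above 0.5\n",
"toy_matches = awesome_cossim_top(toy_tfidf, toy_tfidf.transpose(), 3, 0.5)\n",
"get_matches_df(toy_matches, toy_names)"
],
"metadata": {
"collapsed": false
}
},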
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 31/31 [00:01<00:00, 27.86it/s]\n"
]
}
],
"source": [
"merger = pd.DataFrame()\n",
"\n",
"# for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n",
"for i in tqdm(list(univ_norm[\"Country\"].unique())):\n",
" sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n",
" types = sub_inst['Institution_harm']\n",
" vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
" tf_idf_matrix = vectorizer.fit_transform(types)\n",
" t1 = time.time()\n",
" matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8 if i!=\"China\" else 0.9)\n",
" t = time.time()-t1\n",
"\n",
" # store the matches into new dataframe called matched_df and printing 10 samples\n",
" matches_df = get_matches_df(matches, types)\n",
" matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n",
" matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n",
" matches_df[\"Country\"] = i\n",
" # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n",
" # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n",
" merger = pd.concat([merger,matches_df], ignore_index=True)\n",
"\n",
"for s in [\"left\",\"right\"]:\n",
" merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n",
"\n",
"# Apply the function to create a new column\n",
"merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n",
"\n",
"# Use apply again to create the high_side and low_side columns\n",
"merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n",
"merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], axis=1)\n",
"\n",
"# Drop the high_count and low_count columns if they are not needed\n",
"# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)"
],
"metadata": {
"collapsed": false
}
},
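{
"cell_type": "markdown",
"source": [
"Illustrative inspection (not in the original run): the strongest remaining fuzzy matches together with their frequency counts, before the low-frequency variants are collapsed into the high-frequency ones.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"merger.sort_values(\"similarity\", ascending=False)[[\"Country\", \"low_side\", \"high_side\", \"low_count\", \"high_count\", \"similarity\"]].head(10)"
],
"metadata": {
"collapsed": false
}
},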
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1916\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1916it [01:14, 25.70it/s]\n"
]
}
],
"source": [
"fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n",
"fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n",
"fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n",
"print(len(fuzzymerger))\n",
"univ_harm = univ.copy()\n",
"univ_harm[\"merge_iter\"] = 0\n",
"for i,row in tqdm(fuzzymerger.iterrows()):\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\")&\n",
" (univ_harm[\"Institution\"].str.lower().str.contains(\"sapien\"))&\n",
" (univ_harm[\"Institution\"].str.lower().str.contains(\"univ\"))), \"Institution_harm\"] = \"Sapienza Univ Rome\""
],
"metadata": {
"collapsed": false
}
},
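{
"cell_type": "markdown",
"source": [
"Illustrative check (not in the original run): list the raw affiliation strings that were collapsed into the manually harmonized `Sapienza Univ Rome` label.\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"univ_harm.loc[univ_harm[\"Institution_harm\"] == \"Sapienza Univ Rome\", \"Institution\"].unique()"
],
"metadata": {
"collapsed": false
}
},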
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "Institution 19821\nInstitution_harm 16646\ndtype: int64"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm[[\"Institution\",\"Institution_harm\"]].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution Country \n124019 WOS:000663304500011 Excellium Consulting United Kingdom \\\n126511 WOS:000674472400006 Tongji Univ China \n33359 WOS:000391252900006 Beihang Univ China \n153729 WOS:000787596500003 Fudan Univ China \n69317 WOS:000467564700105 Guangdong Univ Technol China \n... ... ... ... \n160384 WOS:000812531900012 Zhengzhou Univ China \n133501 WOS:000702637000007 Univ Luxembourg Luxembourg \n140349 WOS:000728149000027 Regina Montis Regalis Hosp Italy \n160632 WOS:000813959600003 Liyang Peoples Hosp China \n174735 WOS:000888555700002 Swiss Fed Inst Technol Switzerland \n\n Institution_harm merge_iter \n124019 Excellium Consulting 0 \n126511 Tongji Univ 0 \n33359 Beihang Univ 0 \n153729 Fudan Univ 0 \n69317 Guangdong Univ Technol 0 \n... ... ... \n160384 Zhengzhou Univ 0 \n133501 Univ Luxembourg 0 \n140349 Regina Montis Regalis Hosp 0 \n160632 Liyang Peoples Hosp 0 \n174735 Swiss Fed Inst Technol 0 \n\n[500 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>merge_iter</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>124019</th>\n <td>WOS:000663304500011</td>\n <td>Excellium Consulting</td>\n <td>United Kingdom</td>\n <td>Excellium Consulting</td>\n <td>0</td>\n </tr>\n <tr>\n <th>126511</th>\n <td>WOS:000674472400006</td>\n <td>Tongji Univ</td>\n <td>China</td>\n <td>Tongji Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>33359</th>\n <td>WOS:000391252900006</td>\n <td>Beihang Univ</td>\n <td>China</td>\n <td>Beihang Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>153729</th>\n <td>WOS:000787596500003</td>\n <td>Fudan Univ</td>\n <td>China</td>\n <td>Fudan Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>69317</th>\n <td>WOS:000467564700105</td>\n <td>Guangdong Univ Technol</td>\n <td>China</td>\n <td>Guangdong Univ Technol</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>160384</th>\n <td>WOS:000812531900012</td>\n <td>Zhengzhou Univ</td>\n <td>China</td>\n <td>Zhengzhou Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>133501</th>\n <td>WOS:000702637000007</td>\n <td>Univ Luxembourg</td>\n <td>Luxembourg</td>\n <td>Univ Luxembourg</td>\n <td>0</td>\n </tr>\n <tr>\n <th>140349</th>\n <td>WOS:000728149000027</td>\n <td>Regina Montis Regalis Hosp</td>\n <td>Italy</td>\n <td>Regina Montis Regalis Hosp</td>\n <td>0</td>\n </tr>\n <tr>\n <th>160632</th>\n <td>WOS:000813959600003</td>\n <td>Liyang Peoples Hosp</td>\n <td>China</td>\n <td>Liyang Peoples Hosp</td>\n <td>0</td>\n </tr>\n <tr>\n <th>174735</th>\n <td>WOS:000888555700002</td>\n <td>Swiss Fed Inst Technol</td>\n <td>Switzerland</td>\n <td>Swiss Fed Inst Technol</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>500 rows × 5 columns</p>\n</div>"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm.sample(500)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"data": {
"text/plain": "merge_iter\n0 174128\n1 3966\n2 521\n3 22\n4 1\nName: count, dtype: int64"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm[\"merge_iter\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [
"univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}