ZSI_Reconnect_China/WOS/wos_univ_normalizer.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"# Importing libraries and module and some setting for notebook\n",
"\n",
"import pandas as pd\n",
"import re\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"from scipy.sparse import csr_matrix\n",
"import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n",
"import time\n",
"from tqdm import tqdm"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"def wikinorm(univ_string):\n",
" from googlesearch import search\n",
" from nltk.metrics import edit_distance\n",
" from operator import itemgetter\n",
" from numpy.random import default_rng\n",
" rng = default_rng()\n",
" results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n",
" univ_name = univ_string.split(\",\")[0]\n",
" u_results = [i.title for i in results if \"Category:\" not in i.title]\n",
" return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n"
],
"metadata": {
"collapsed": false
}
},
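{
"cell_type": "markdown",
"source": [
"A minimal usage sketch for `wikinorm`, assuming the unofficial `googlesearch` package and live network access; the example query string is illustrative and follows the commented-out `search_for` pattern used further below."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Illustrative only: wikinorm issues live Google queries via the unofficial googlesearch\n",
"# package, so the call below is left commented out. The query format mirrors the\n",
"# commented-out search_for column built further below (Institution + \", \" + Country + \", wikipedia\").\n",
"# wikinorm(\"Univ Pisa, Italy, wikipedia\")"
],
"metadata": {
"collapsed": false
}
},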
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"def replace_uppercase_words(text):\n",
" words = text.split()\n",
" all_uppercase = all(word.isupper() for word in words)\n",
" all_lowercase = all(word.islower() for word in words)\n",
" if all_uppercase or all_lowercase:\n",
" return text\n",
" else:\n",
" result = []\n",
" for word in words:\n",
" w = word.strip()\n",
" if not w.isupper() and not w.islower():\n",
" result.append(w)\n",
" return \" \".join(result).strip()"
],
"metadata": {
"collapsed": false
}
},
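{
"cell_type": "markdown",
"source": [
"A quick sanity check of `replace_uppercase_words` on strings visible in the sample further below: uniformly cased strings pass through unchanged, while acronyms inside mixed-case names are dropped."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# All-uppercase strings are returned as-is; acronyms inside mixed-case names are dropped.\n",
"print(replace_uppercase_words(\"CNR\"))                    # -> \"CNR\"\n",
"print(replace_uppercase_words(\"KTH Royal Inst Technol\")) # -> \"Royal Inst Technol\""
],
"metadata": {
"collapsed": false
}
},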
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: Pandarallel will run on 4 workers.\n",
"INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n",
"\n",
"WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
"https://nalepae.github.io/pandarallel/troubleshooting/\n"
]
},
{
"data": {
"text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "92c1cd6c14644ffeb042b38f5d5d98c5"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"178638\n"
]
}
],
"source": [
"outdir=\"wos_processed_data\"\n",
"univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
"\n",
"from pandarallel import pandarallel\n",
"pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
"\n",
"univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
"print(len(univ))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution \n153271 WOS:000784587900008 Univ Pisa \\\n159800 WOS:000810042500002 China Japan Friendship Hosp \n130931 WOS:000691922800007 Karl Franzens Univ Graz \n1500 WOS:000292944600012 CNR \n113964 WOS:000618210000032 Karolinska Univ Hosp \n... ... ... \n160284 WOS:000812227000009 Univ Appl Sci Upper Austria \n29314 WOS:000381396400013 Univ Southampton \n17045 WOS:000347046200017 Charles Univ Prague \n164118 WOS:000832954200001 Nanjing Univ Aeronaut & Astronaut \n109992 WOS:000604257500070 KTH Royal Inst Technol \n\n Country Institution_harm \n153271 Italy Univ Pisa \n159800 China China Japan Friendship Hosp \n130931 Austria Karl Franzens Univ Graz \n1500 Italy CNR \n113964 Sweden Karolinska Univ Hosp \n... ... ... \n160284 Austria Univ Appl Sci Upper Austria \n29314 United Kingdom Univ Southampton \n17045 Czech Republic Charles Univ Prague \n164118 China Nanjing Univ Aeronaut & Astronaut \n109992 Sweden Royal Inst Technol \n\n[100 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>153271</th>\n <td>WOS:000784587900008</td>\n <td>Univ Pisa</td>\n <td>Italy</td>\n <td>Univ Pisa</td>\n </tr>\n <tr>\n <th>159800</th>\n <td>WOS:000810042500002</td>\n <td>China Japan Friendship Hosp</td>\n <td>China</td>\n <td>China Japan Friendship Hosp</td>\n </tr>\n <tr>\n <th>130931</th>\n <td>WOS:000691922800007</td>\n <td>Karl Franzens Univ Graz</td>\n <td>Austria</td>\n <td>Karl Franzens Univ Graz</td>\n </tr>\n <tr>\n <th>1500</th>\n <td>WOS:000292944600012</td>\n <td>CNR</td>\n <td>Italy</td>\n <td>CNR</td>\n </tr>\n <tr>\n <th>113964</th>\n <td>WOS:000618210000032</td>\n <td>Karolinska Univ Hosp</td>\n <td>Sweden</td>\n <td>Karolinska Univ Hosp</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>160284</th>\n <td>WOS:000812227000009</td>\n <td>Univ Appl Sci Upper Austria</td>\n <td>Austria</td>\n <td>Univ Appl Sci Upper Austria</td>\n </tr>\n <tr>\n <th>29314</th>\n <td>WOS:000381396400013</td>\n <td>Univ Southampton</td>\n <td>United Kingdom</td>\n <td>Univ Southampton</td>\n </tr>\n <tr>\n <th>17045</th>\n <td>WOS:000347046200017</td>\n <td>Charles Univ Prague</td>\n <td>Czech Republic</td>\n <td>Charles Univ Prague</td>\n </tr>\n <tr>\n <th>164118</th>\n <td>WOS:000832954200001</td>\n <td>Nanjing Univ Aeronaut &amp; Astronaut</td>\n <td>China</td>\n <td>Nanjing Univ Aeronaut &amp; Astronaut</td>\n </tr>\n <tr>\n <th>109992</th>\n <td>WOS:000604257500070</td>\n <td>KTH Royal Inst Technol</td>\n <td>Sweden</td>\n <td>Royal Inst Technol</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": " Country Institution_harm count\n8168 Croatia Inst Adriat Crops & Karst Reclamat 1\n3417 China Ctr Eye & Vis Res 1\n1034 China Westlake Inst Adv Study 13\n13427 Italy Macerata Hosp 1\n8071 China Key Lab Ecoind Green Technol Fujian Prov 1\n... ... ... ...\n17230 United Kingdom Univ Kingston 6\n8847 France Univ Artois 8\n16071 Spain Catalonia Geriatr & Gerontol Soc 1\n6357 China Wuxi Huace Elect Syst Co Ltd 1\n9049 France Excelia Business Sch 3\n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>8168</th>\n <td>Croatia</td>\n <td>Inst Adriat Crops &amp; Karst Reclamat</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3417</th>\n <td>China</td>\n <td>Ctr Eye &amp; Vis Res</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1034</th>\n <td>China</td>\n <td>Westlake Inst Adv Study</td>\n <td>13</td>\n </tr>\n <tr>\n <th>13427</th>\n <td>Italy</td>\n <td>Macerata Hosp</td>\n <td>1</td>\n </tr>\n <tr>\n <th>8071</th>\n <td>China</td>\n <td>Key Lab Ecoind Green Technol Fujian Prov</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>17230</th>\n <td>United Kingdom</td>\n <td>Univ Kingston</td>\n <td>6</td>\n </tr>\n <tr>\n <th>8847</th>\n <td>France</td>\n <td>Univ Artois</td>\n <td>8</td>\n </tr>\n <tr>\n <th>16071</th>\n <td>Spain</td>\n <td>Catalonia Geriatr &amp; Gerontol Soc</td>\n <td>1</td>\n </tr>\n <tr>\n <th>6357</th>\n <td>China</td>\n <td>Wuxi Huace Elect Syst Co Ltd</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9049</th>\n <td>France</td>\n <td>Excelia Business Sch</td>\n <td>3</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n",
"# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n",
"univ_norm.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# from pandarallel import pandarallel\n",
"# pandarallel.initialize(progress_bar=True, nb_workers=2)\n",
"#\n",
"# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"def ngrams(string, n=3):\n",
"\n",
" string = re.sub(r'[,-./]|\\sBD',r'', string)\n",
" ngrams = zip(*[string[i:] for i in range(n)])\n",
" return [''.join(ngram) for ngram in ngrams]\n",
"\n",
"# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.\n",
"# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.\n",
"\n",
"def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
" # force A and B as a CSR matrix.\n",
" # If they have already been CSR, there is no overhead\n",
" A = A.tocsr()\n",
" B = B.tocsr()\n",
" M, _ = A.shape\n",
" _, N = B.shape\n",
"\n",
" idx_dtype = np.int32\n",
"\n",
" nnz_max = M*ntop\n",
"\n",
" indptr = np.zeros(M+1, dtype=idx_dtype)\n",
" indices = np.zeros(nnz_max, dtype=idx_dtype)\n",
" data = np.zeros(nnz_max, dtype=A.dtype)\n",
"\n",
" ct.sparse_dot_topn(\n",
" M, N, np.asarray(A.indptr, dtype=idx_dtype),\n",
" np.asarray(A.indices, dtype=idx_dtype),\n",
" A.data,\n",
" np.asarray(B.indptr, dtype=idx_dtype),\n",
" np.asarray(B.indices, dtype=idx_dtype),\n",
" B.data,\n",
" ntop,\n",
" lower_bound,\n",
" indptr, indices, data)\n",
"\n",
" return csr_matrix((data,indices,indptr),shape=(M,N))\n",
"\n",
"# unpacks the resulting sparse matrix\n",
"\n",
"def get_matches_df(sparse_matrix, name_vector, top=None):\n",
" non_zeros = sparse_matrix.nonzero()\n",
"\n",
" sparserows = non_zeros[0]\n",
" sparsecols = non_zeros[1]\n",
"\n",
" if top:\n",
" nr_matches = top\n",
" else:\n",
" nr_matches = sparsecols.size\n",
"\n",
" left_side = np.empty([nr_matches], dtype=object)\n",
" right_side = np.empty([nr_matches], dtype=object)\n",
" similarity = np.zeros(nr_matches)\n",
"\n",
" for index in range(0, nr_matches):\n",
" left_side[index] = name_vector[sparserows[index]]\n",
" right_side[index] = name_vector[sparsecols[index]]\n",
" similarity[index] = sparse_matrix.data[index]\n",
"\n",
" return pd.DataFrame({'left_side': left_side,\n",
" 'right_side': right_side,\n",
" 'similarity': similarity})\n",
"\n",
"\n",
"def discrepancy_filter(df):\n",
" f_df = df.copy()\n",
" tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n",
" for token in tokenlist:\n",
" f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n",
" (~f_df[\"left_side\"].str.contains(token)))\n",
" |\n",
" ((f_df[\"left_side\"].str.contains(token))&\n",
" (~f_df[\"right_side\"].str.contains(token))))].copy()\n",
" return f_df\n",
"\n",
"\n",
"# Define a function to get the high and low counts for each row\n",
"def get_high_low_counts(row):\n",
" if row['left_count'] > row['right_count']:\n",
" high_count = row['left_count']\n",
" low_count = row['right_count']\n",
" else: #row['left_count'] < row['right_count']:\n",
" high_count = row['right_count']\n",
" low_count = row['left_count']\n",
" # else:\n",
" # if len(row['left_side']) > len(row['right_side']):\n",
" # high_count = len(row['left_side'])\n",
" # low_count = len(row['right_side'])\n",
" # else:\n",
" # high_count = len(row['right_side'])\n",
" # low_count = len(row['left_side'])\n",
" return pd.Series([high_count, low_count])"
],
"metadata": {
"collapsed": false
}
},
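{
"cell_type": "markdown",
"source": [
"A small sanity check of the trigram generator and the TF-IDF / sparse cosine-similarity pipeline on a few made-up institution strings (illustrative names, not project data)."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Illustrative sketch with made-up names, not project data.\n",
"demo_names = pd.Series([\"Univ Pisa\", \"Univ of Pisa\", \"Pisa Univ Hosp\"])\n",
"print(ngrams(\"Univ Pisa\"))  # character trigrams after stripping [,-./]\n",
"\n",
"demo_tfidf = TfidfVectorizer(min_df=1, analyzer=ngrams).fit_transform(demo_names)\n",
"demo_matches = awesome_cossim_top(demo_tfidf, demo_tfidf.transpose(), 3, 0)\n",
"demo_df = get_matches_df(demo_matches, demo_names)\n",
"demo_df = demo_df[demo_df[\"similarity\"] < 0.99999]  # drop exact self-matches\n",
"# discrepancy_filter drops pairs where only one side contains a token such as \"Hosp\"\n",
"print(discrepancy_filter(demo_df))"
],
"metadata": {
"collapsed": false
}
},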
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 31/31 [00:00<00:00, 31.97it/s]\n"
]
}
],
"source": [
"merger = pd.DataFrame()\n",
"\n",
"# for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n",
"for i in tqdm(list(univ_norm[\"Country\"].unique())):\n",
" sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n",
" types = sub_inst['Institution_harm']\n",
" vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
" tf_idf_matrix = vectorizer.fit_transform(types)\n",
" t1 = time.time()\n",
" matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8 if i!=\"China\" else 0.9)\n",
" t = time.time()-t1\n",
"\n",
" # store the matches into new dataframe called matched_df and printing 10 samples\n",
" matches_df = get_matches_df(matches, types)\n",
" matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n",
" matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n",
" matches_df[\"Country\"] = i\n",
" # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n",
" # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n",
" merger = pd.concat([merger,matches_df], ignore_index=True)\n",
"\n",
"for s in [\"left\",\"right\"]:\n",
" merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n",
"\n",
"# Apply the function to create a new column\n",
"merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n",
"\n",
"# Use apply again to create the high_side and low_side columns\n",
"merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n",
"merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], axis=1)\n",
"\n",
"# Drop the high_count and low_count columns if they are not needed\n",
"# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1916\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1916it [01:11, 26.94it/s]\n"
]
}
],
"source": [
"fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n",
"fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n",
"fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n",
"print(len(fuzzymerger))\n",
"univ_harm = univ.copy()\n",
"univ_harm[\"merge_iter\"] = 0\n",
"for i,row in tqdm(fuzzymerger.iterrows()):\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n",
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]"
],
"metadata": {
"collapsed": false
}
},
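{
"cell_type": "markdown",
"source": [
"A toy illustration (made-up rows, not project data) of the replacement loop above: within a country, each low-frequency spelling (`low_side`) is rewritten to its high-frequency counterpart (`high_side`), and `merge_iter` counts how many times a row has been rewritten."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Toy illustration with made-up rows, not project data.\n",
"toy = pd.DataFrame({\"Country\": [\"Italy\", \"Italy\"],\n",
"                    \"Institution_harm\": [\"Univ Pisa\", \"Univ of Pisa\"],\n",
"                    \"merge_iter\": [0, 0]})\n",
"toy_rule = {\"Country\": \"Italy\", \"low_side\": \"Univ of Pisa\", \"high_side\": \"Univ Pisa\"}\n",
"mask = (toy[\"Country\"] == toy_rule[\"Country\"]) & (toy[\"Institution_harm\"] == toy_rule[\"low_side\"])\n",
"toy.loc[mask, \"merge_iter\"] += 1\n",
"toy.loc[mask, \"Institution_harm\"] = toy_rule[\"high_side\"]\n",
"print(toy)"
],
"metadata": {
"collapsed": false
}
},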
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# Manual fix: collapse the remaining Sapienza University of Rome variants\n",
"univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\") &\n",
"               (univ_harm[\"Institution\"].str.lower().str.contains(\"sapien\")) &\n",
"               (univ_harm[\"Institution\"].str.lower().str.contains(\"univ\"))), \"Institution_harm\"] = \"Sapienza Univ Rome\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "Institution 19821\nInstitution_harm 16646\ndtype: int64"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm[[\"Institution\",\"Institution_harm\"]].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution Country \n49282 WOS:000428099700011 Univ Sheffield United Kingdom \\\n51975 WOS:000432981300002 Chinese Acad Sci China \n64618 WOS:000459693000011 Babes Bolyai Univ Romania \n163145 WOS:000828102100001 Xidian Univ China \n99690 WOS:000566510600001 Fora Forest Technol Spain \n... ... ... ... \n1567 WOS:000293492500004 Univ Essex United Kingdom \n73076 WOS:000476471800022 Shanghai Univ China \n137096 WOS:000715426400001 Queen Mary Hosp China \n164978 WOS:000836819000003 Manchester Metropolitan Univ United Kingdom \n32973 WOS:000390181300013 Univ Complutense Madrid Spain \n\n Institution_harm merge_iter \n49282 Univ Sheffield 0 \n51975 Chinese Acad Sci 0 \n64618 Babes Bolyai Univ 0 \n163145 Xidian Univ 0 \n99690 Fora Forest Technol 0 \n... ... ... \n1567 Univ Essex 0 \n73076 Shanghai Univ 0 \n137096 Queen Mary Hosp 0 \n164978 Manchester Metropolitan Univ 0 \n32973 Univ Complutense Madrid 0 \n\n[500 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>merge_iter</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>49282</th>\n <td>WOS:000428099700011</td>\n <td>Univ Sheffield</td>\n <td>United Kingdom</td>\n <td>Univ Sheffield</td>\n <td>0</td>\n </tr>\n <tr>\n <th>51975</th>\n <td>WOS:000432981300002</td>\n <td>Chinese Acad Sci</td>\n <td>China</td>\n <td>Chinese Acad Sci</td>\n <td>0</td>\n </tr>\n <tr>\n <th>64618</th>\n <td>WOS:000459693000011</td>\n <td>Babes Bolyai Univ</td>\n <td>Romania</td>\n <td>Babes Bolyai Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>163145</th>\n <td>WOS:000828102100001</td>\n <td>Xidian Univ</td>\n <td>China</td>\n <td>Xidian Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>99690</th>\n <td>WOS:000566510600001</td>\n <td>Fora Forest Technol</td>\n <td>Spain</td>\n <td>Fora Forest Technol</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1567</th>\n <td>WOS:000293492500004</td>\n <td>Univ Essex</td>\n <td>United Kingdom</td>\n <td>Univ Essex</td>\n <td>0</td>\n </tr>\n <tr>\n <th>73076</th>\n <td>WOS:000476471800022</td>\n <td>Shanghai Univ</td>\n <td>China</td>\n <td>Shanghai Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>137096</th>\n <td>WOS:000715426400001</td>\n <td>Queen Mary Hosp</td>\n <td>China</td>\n <td>Queen Mary Hosp</td>\n <td>0</td>\n </tr>\n <tr>\n <th>164978</th>\n <td>WOS:000836819000003</td>\n <td>Manchester Metropolitan Univ</td>\n <td>United Kingdom</td>\n <td>Manchester Metropolitan Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>32973</th>\n <td>WOS:000390181300013</td>\n <td>Univ Complutense Madrid</td>\n <td>Spain</td>\n <td>Univ Complutense Madrid</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>500 rows × 5 columns</p>\n</div>"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm.sample(500)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [
"univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}