{ "cells": [ { "cell_type": "code", "execution_count": 1, "outputs": [], "source": [ "import pandas as pd\n", "# Importing libraries and module and some setting for notebook\n", "\n", "import pandas as pd\n", "import re\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import numpy as np\n", "from scipy.sparse import csr_matrix\n", "import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n", "import time\n", "from tqdm import tqdm" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "def wikinorm(univ_string):\n", " from googlesearch import search\n", " from nltk.metrics import edit_distance\n", " from operator import itemgetter\n", " from numpy.random import default_rng\n", " rng = default_rng()\n", " results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n", " univ_name = univ_string.split(\",\")[0]\n", " u_results = [i.title for i in results if \"Category:\" not in i.title]\n", " return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "outputs": [], "source": [ "def replace_uppercase_words(text):\n", " words = text.split()\n", " all_uppercase = all(word.isupper() for word in words)\n", " all_lowercase = all(word.islower() for word in words)\n", " if all_uppercase or all_lowercase:\n", " return text\n", " else:\n", " result = []\n", " for word in words:\n", " w = word.strip()\n", " if not w.isupper() and not w.islower():\n", " result.append(w)\n", " return \" \".join(result).strip()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO: Pandarallel will run on 4 workers.\n", "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process 
# Folder that holds the notebook's input and output spreadsheets.
outdir="wos_processed_data"
# Load the institution/location table (the stored outputs show the columns
# "UT (Unique WOS ID)", "Institution" and "Country").
univ = pd.read_excel(f"{outdir}/wos_institution_locations.xlsx")

from pandarallel import pandarallel
# pandarallel must be initialised before `parallel_apply` is used below;
# here it is set up with 4 workers and a progress bar.
pandarallel.initialize(progress_bar=True, nb_workers=4)

# Apply `replace_uppercase_words` to every institution name in parallel and
# keep the result in a separate column so the raw name stays available.
univ["Institution_harm"] = univ["Institution"].parallel_apply(replace_uppercase_words)
print(len(univ))
\n21250 China Jilin Univ \n23018 Finland Tampere Univ Technol \n126847 Denmark Univ Copenhagen \n15313 Italy Univ Siena \n77834 China Tsinghua Univ \n\n[100 rows x 4 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)InstitutionCountryInstitution_harm
149037WOS:000764953300001Univ Elect Sci & Technol ChinaChinaUniv Elect Sci & Technol China
86834WOS:000519526500027Radboud Univ NijmegenNetherlandsRadboud Univ Nijmegen
143915WOS:000739917304088Swiss Fed Inst TechnolSwitzerlandSwiss Fed Inst Technol
135117WOS:000707680800001North China Elect Power Univ BeijingChinaNorth China Elect Power Univ Beijing
110390WOS:000605608700001Imperial Coll LondonUnited KingdomImperial Coll London
...............
21250WOS:000358912300001Jilin UnivChinaJilin Univ
23018WOS:000364230600002Tampere Univ TechnolFinlandTampere Univ Technol
126847WOS:000675855300001Univ CopenhagenDenmarkUniv Copenhagen
15313WOS:000343701400001Univ SienaItalyUniv Siena
77834WOS:000490147400012Tsinghua UnivChinaTsinghua Univ
\n

100 rows × 4 columns

\n
" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ.sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "data": { "text/plain": " Country Institution_harm \n496 Belgium Haute Ecole Louvain Hainaut \\\n10566 Germany IQM \n6670 China Tiantan Hosp \n16974 Switzerland Species Survival Commiss Mushroom Bracket & \n9200 France Hop Hotel Dieu Paris \n... ... ... \n11326 Germany Int Max Planck Res Sch Earth Syst Modeling \n2874 China China Natl Nucl Corp 416 Hosp \n17967 United Kingdom AccelerComm Ltd \n14041 Netherlands Vankeulen Advies \n4148 China First Peoples Hosp Jingmen Affiliated Hubei Minzu \n\n count \n496 1 \n10566 4 \n6670 1 \n16974 1 \n9200 2 \n... ... \n11326 1 \n2874 2 \n17967 1 \n14041 1 \n4148 1 \n\n[100 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
CountryInstitution_harmcount
496BelgiumHaute Ecole Louvain Hainaut1
10566GermanyIQM4
6670ChinaTiantan Hosp1
16974SwitzerlandSpecies Survival Commiss Mushroom Bracket &1
9200FranceHop Hotel Dieu Paris2
............
11326GermanyInt Max Planck Res Sch Earth Syst Modeling1
2874ChinaChina Natl Nucl Corp 416 Hosp2
17967United KingdomAccelerComm Ltd1
14041NetherlandsVankeulen Advies1
4148ChinaFirst Peoples Hosp Jingmen Affiliated Hubei Minzu1
\n

100 rows × 3 columns

\n
" }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n", "# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n", "univ_norm.sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "# from pandarallel import pandarallel\n", "# pandarallel.initialize(progress_bar=True, nb_workers=2)\n", "#\n", "# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [], "source": [ "def ngrams(string, n=3):\n", "\n", " string = re.sub(r'[,-./]|\\sBD',r'', string)\n", " ngrams = zip(*[string[i:] for i in range(n)])\n", " return [''.join(ngram) for ngram in ngrams]\n", "\n", "# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.\n", "# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.\n", "\n", "def awesome_cossim_top(A, B, ntop, lower_bound=0):\n", " # force A and B as a CSR matrix.\n", " # If they have already been CSR, there is no overhead\n", " A = A.tocsr()\n", " B = B.tocsr()\n", " M, _ = A.shape\n", " _, N = B.shape\n", "\n", " idx_dtype = np.int32\n", "\n", " nnz_max = M*ntop\n", "\n", " indptr = np.zeros(M+1, dtype=idx_dtype)\n", " indices = np.zeros(nnz_max, dtype=idx_dtype)\n", " data = np.zeros(nnz_max, dtype=A.dtype)\n", "\n", " ct.sparse_dot_topn(\n", " M, N, np.asarray(A.indptr, dtype=idx_dtype),\n", " np.asarray(A.indices, dtype=idx_dtype),\n", " A.data,\n", " np.asarray(B.indptr, dtype=idx_dtype),\n", " np.asarray(B.indices, dtype=idx_dtype),\n", " B.data,\n", " ntop,\n", " lower_bound,\n", " indptr, indices, data)\n", "\n", " return csr_matrix((data,indices,indptr),shape=(M,N))\n", 
"\n", "# unpacks the resulting sparse matrix\n", "\n", "def get_matches_df(sparse_matrix, name_vector, top=None):\n", " non_zeros = sparse_matrix.nonzero()\n", "\n", " sparserows = non_zeros[0]\n", " sparsecols = non_zeros[1]\n", "\n", " if top:\n", " nr_matches = top\n", " else:\n", " nr_matches = sparsecols.size\n", "\n", " left_side = np.empty([nr_matches], dtype=object)\n", " right_side = np.empty([nr_matches], dtype=object)\n", " similarity = np.zeros(nr_matches)\n", "\n", " for index in range(0, nr_matches):\n", " left_side[index] = name_vector[sparserows[index]]\n", " right_side[index] = name_vector[sparsecols[index]]\n", " similarity[index] = sparse_matrix.data[index]\n", "\n", " return pd.DataFrame({'left_side': left_side,\n", " 'right_side': right_side,\n", " 'similarity': similarity})\n", "\n", "\n", "def discrepancy_filter(df):\n", " f_df = df.copy()\n", " tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n", " for token in tokenlist:\n", " f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n", " (~f_df[\"left_side\"].str.contains(token)))\n", " |\n", " ((f_df[\"left_side\"].str.contains(token))&\n", " (~f_df[\"right_side\"].str.contains(token))))].copy()\n", " return f_df\n", "\n", "\n", "# Define a function to get the high and low counts for each row\n", "def get_high_low_counts(row):\n", " if row['left_count'] > row['right_count']:\n", " high_count = row['left_count']\n", " low_count = row['right_count']\n", " else: #row['left_count'] < row['right_count']:\n", " high_count = row['right_count']\n", " low_count = row['left_count']\n", " # else:\n", " # if len(row['left_side']) > len(row['right_side']):\n", " # high_count = len(row['left_side'])\n", " # low_count = len(row['right_side'])\n", " # else:\n", " # high_count = len(row['right_side'])\n", " # low_count = len(row['left_side'])\n", " return pd.Series([high_count, low_count])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, 
"outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 31/31 [00:01<00:00, 27.86it/s]\n" ] } ], "source": [ "merger = pd.DataFrame()\n", "\n", "# for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n", "for i in tqdm(list(univ_norm[\"Country\"].unique())):\n", " sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n", " types = sub_inst['Institution_harm']\n", " vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n", " tf_idf_matrix = vectorizer.fit_transform(types)\n", " t1 = time.time()\n", " matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8 if i!=\"China\" else 0.9)\n", " t = time.time()-t1\n", "\n", " # store the matches into new dataframe called matched_df and printing 10 samples\n", " matches_df = get_matches_df(matches, types)\n", " matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n", " matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n", " matches_df[\"Country\"] = i\n", " # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n", " # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n", " merger = pd.concat([merger,matches_df], ignore_index=True)\n", "\n", "for s in [\"left\",\"right\"]:\n", " merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n", "\n", "# Apply the function to create a new column\n", "merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n", "\n", "# Use apply again to create the high_side and low_side columns\n", "merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n", "merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], 
axis=1)\n", "\n", "# Drop the high_count and low_count columns if they are not needed\n", "# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1916\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "1916it [01:14, 25.70it/s]\n" ] } ], "source": [ "fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n", "fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n", "fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n", "print(len(fuzzymerger))\n", "univ_harm = univ.copy()\n", "univ_harm[\"merge_iter\"] = 0\n", "for i,row in tqdm(fuzzymerger.iterrows()):\n", " univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n", " (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n", " univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n", " (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [], "source": [ "# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [ "# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\")&\n", " 
(univ_harm[\"Institution\"].str.lower().str.contains(\"sapien\"))&\n", " (univ_harm[\"Institution\"].str.lower().str.contains(\"univ\"))), \"Institution_harm\"] = \"Sapienza Univ Rome\"" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "data": { "text/plain": "Institution 19821\nInstitution_harm 16646\ndtype: int64" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_harm[[\"Institution\",\"Institution_harm\"]].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Institution Country \n124019 WOS:000663304500011 Excellium Consulting United Kingdom \\\n126511 WOS:000674472400006 Tongji Univ China \n33359 WOS:000391252900006 Beihang Univ China \n153729 WOS:000787596500003 Fudan Univ China \n69317 WOS:000467564700105 Guangdong Univ Technol China \n... ... ... ... \n160384 WOS:000812531900012 Zhengzhou Univ China \n133501 WOS:000702637000007 Univ Luxembourg Luxembourg \n140349 WOS:000728149000027 Regina Montis Regalis Hosp Italy \n160632 WOS:000813959600003 Liyang Peoples Hosp China \n174735 WOS:000888555700002 Swiss Fed Inst Technol Switzerland \n\n Institution_harm merge_iter \n124019 Excellium Consulting 0 \n126511 Tongji Univ 0 \n33359 Beihang Univ 0 \n153729 Fudan Univ 0 \n69317 Guangdong Univ Technol 0 \n... ... ... \n160384 Zhengzhou Univ 0 \n133501 Univ Luxembourg 0 \n140349 Regina Montis Regalis Hosp 0 \n160632 Liyang Peoples Hosp 0 \n174735 Swiss Fed Inst Technol 0 \n\n[500 rows x 5 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)InstitutionCountryInstitution_harmmerge_iter
124019WOS:000663304500011Excellium ConsultingUnited KingdomExcellium Consulting0
126511WOS:000674472400006Tongji UnivChinaTongji Univ0
33359WOS:000391252900006Beihang UnivChinaBeihang Univ0
153729WOS:000787596500003Fudan UnivChinaFudan Univ0
69317WOS:000467564700105Guangdong Univ TechnolChinaGuangdong Univ Technol0
..................
160384WOS:000812531900012Zhengzhou UnivChinaZhengzhou Univ0
133501WOS:000702637000007Univ LuxembourgLuxembourgUniv Luxembourg0
140349WOS:000728149000027Regina Montis Regalis HospItalyRegina Montis Regalis Hosp0
160632WOS:000813959600003Liyang Peoples HospChinaLiyang Peoples Hosp0
174735WOS:000888555700002Swiss Fed Inst TechnolSwitzerlandSwiss Fed Inst Technol0
\n

500 rows × 5 columns

\n
" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_harm.sample(500)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 17, "outputs": [ { "data": { "text/plain": "merge_iter\n0 174128\n1 3966\n2 521\n3 22\n4 1\nName: count, dtype: int64" }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_harm[\"merge_iter\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [], "source": [ "univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)" ], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }