{ "cells": [ { "cell_type": "code", "execution_count": 1, "outputs": [], "source": [ "import pandas as pd\n", "# Importing libraries and module and some setting for notebook\n", "\n", "import pandas as pd\n", "import re\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import numpy as np\n", "from scipy.sparse import csr_matrix\n", "import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n", "import time\n", "from tqdm import tqdm" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "def wikinorm(univ_string):\n", " from googlesearch import search\n", " from nltk.metrics import edit_distance\n", " from operator import itemgetter\n", " from numpy.random import default_rng\n", " rng = default_rng()\n", " results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n", " univ_name = univ_string.split(\",\")[0]\n", " u_results = [i.title for i in results if \"Category:\" not in i.title]\n", " return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "outputs": [], "source": [ "def replace_uppercase_words(text):\n", " words = text.split()\n", " all_uppercase = all(word.isupper() for word in words)\n", " all_lowercase = all(word.islower() for word in words)\n", " if all_uppercase or all_lowercase:\n", " return text\n", " else:\n", " result = []\n", " for word in words:\n", " w = word.strip()\n", " if not w.isupper() and not w.islower():\n", " result.append(w)\n", " return \" \".join(result).strip()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO: Pandarallel will run on 4 workers.\n", "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n", "\n", "WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n", "https://nalepae.github.io/pandarallel/troubleshooting/\n" ] }, { "data": { "text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…", "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "92c1cd6c14644ffeb042b38f5d5d98c5" } }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "178638\n" ] } ], "source": [ "outdir=\"wos_processed_data\"\n", "univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n", "\n", "from pandarallel import pandarallel\n", "pandarallel.initialize(progress_bar=True, nb_workers=4)\n", "\n", "univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n", "print(len(univ))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Institution \n153271 WOS:000784587900008 Univ Pisa \\\n159800 WOS:000810042500002 China Japan Friendship Hosp \n130931 WOS:000691922800007 Karl Franzens Univ Graz \n1500 WOS:000292944600012 CNR \n113964 WOS:000618210000032 Karolinska Univ Hosp \n... ... ... \n160284 WOS:000812227000009 Univ Appl Sci Upper Austria \n29314 WOS:000381396400013 Univ Southampton \n17045 WOS:000347046200017 Charles Univ Prague \n164118 WOS:000832954200001 Nanjing Univ Aeronaut & Astronaut \n109992 WOS:000604257500070 KTH Royal Inst Technol \n\n Country Institution_harm \n153271 Italy Univ Pisa \n159800 China China Japan Friendship Hosp \n130931 Austria Karl Franzens Univ Graz \n1500 Italy CNR \n113964 Sweden Karolinska Univ Hosp \n... ... ... \n160284 Austria Univ Appl Sci Upper Austria \n29314 United Kingdom Univ Southampton \n17045 Czech Republic Charles Univ Prague \n164118 China Nanjing Univ Aeronaut & Astronaut \n109992 Sweden Royal Inst Technol \n\n[100 rows x 4 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)InstitutionCountryInstitution_harm
153271WOS:000784587900008Univ PisaItalyUniv Pisa
159800WOS:000810042500002China Japan Friendship HospChinaChina Japan Friendship Hosp
130931WOS:000691922800007Karl Franzens Univ GrazAustriaKarl Franzens Univ Graz
1500WOS:000292944600012CNRItalyCNR
113964WOS:000618210000032Karolinska Univ HospSwedenKarolinska Univ Hosp
...............
160284WOS:000812227000009Univ Appl Sci Upper AustriaAustriaUniv Appl Sci Upper Austria
29314WOS:000381396400013Univ SouthamptonUnited KingdomUniv Southampton
17045WOS:000347046200017Charles Univ PragueCzech RepublicCharles Univ Prague
164118WOS:000832954200001Nanjing Univ Aeronaut & AstronautChinaNanjing Univ Aeronaut & Astronaut
109992WOS:000604257500070KTH Royal Inst TechnolSwedenRoyal Inst Technol
\n

100 rows × 4 columns

\n
" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ.sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "data": { "text/plain": " Country Institution_harm count\n8168 Croatia Inst Adriat Crops & Karst Reclamat 1\n3417 China Ctr Eye & Vis Res 1\n1034 China Westlake Inst Adv Study 13\n13427 Italy Macerata Hosp 1\n8071 China Key Lab Ecoind Green Technol Fujian Prov 1\n... ... ... ...\n17230 United Kingdom Univ Kingston 6\n8847 France Univ Artois 8\n16071 Spain Catalonia Geriatr & Gerontol Soc 1\n6357 China Wuxi Huace Elect Syst Co Ltd 1\n9049 France Excelia Business Sch 3\n\n[100 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
CountryInstitution_harmcount
8168CroatiaInst Adriat Crops & Karst Reclamat1
3417ChinaCtr Eye & Vis Res1
1034ChinaWestlake Inst Adv Study13
13427ItalyMacerata Hosp1
8071ChinaKey Lab Ecoind Green Technol Fujian Prov1
............
17230United KingdomUniv Kingston6
8847FranceUniv Artois8
16071SpainCatalonia Geriatr & Gerontol Soc1
6357ChinaWuxi Huace Elect Syst Co Ltd1
9049FranceExcelia Business Sch3
\n

100 rows × 3 columns

\n
" }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n", "# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n", "univ_norm.sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "# from pandarallel import pandarallel\n", "# pandarallel.initialize(progress_bar=True, nb_workers=2)\n", "#\n", "# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [], "source": [ "def ngrams(string, n=3):\n", "\n", " string = re.sub(r'[,-./]|\\sBD',r'', string)\n", " ngrams = zip(*[string[i:] for i in range(n)])\n", " return [''.join(ngram) for ngram in ngrams]\n", "\n", "# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.\n", "# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.\n", "\n", "def awesome_cossim_top(A, B, ntop, lower_bound=0):\n", " # force A and B as a CSR matrix.\n", " # If they have already been CSR, there is no overhead\n", " A = A.tocsr()\n", " B = B.tocsr()\n", " M, _ = A.shape\n", " _, N = B.shape\n", "\n", " idx_dtype = np.int32\n", "\n", " nnz_max = M*ntop\n", "\n", " indptr = np.zeros(M+1, dtype=idx_dtype)\n", " indices = np.zeros(nnz_max, dtype=idx_dtype)\n", " data = np.zeros(nnz_max, dtype=A.dtype)\n", "\n", " ct.sparse_dot_topn(\n", " M, N, np.asarray(A.indptr, dtype=idx_dtype),\n", " np.asarray(A.indices, dtype=idx_dtype),\n", " A.data,\n", " np.asarray(B.indptr, dtype=idx_dtype),\n", " np.asarray(B.indices, dtype=idx_dtype),\n", " B.data,\n", " ntop,\n", " lower_bound,\n", " indptr, indices, data)\n", "\n", " return csr_matrix((data,indices,indptr),shape=(M,N))\n", "\n", "# unpacks the resulting sparse matrix\n", "\n", "def get_matches_df(sparse_matrix, name_vector, top=None):\n", " non_zeros = sparse_matrix.nonzero()\n", "\n", " sparserows = non_zeros[0]\n", " sparsecols = non_zeros[1]\n", "\n", " if top:\n", " nr_matches = top\n", " else:\n", " nr_matches = sparsecols.size\n", "\n", " left_side = np.empty([nr_matches], dtype=object)\n", " right_side = np.empty([nr_matches], dtype=object)\n", " similarity = np.zeros(nr_matches)\n", "\n", " for index in range(0, nr_matches):\n", " left_side[index] = name_vector[sparserows[index]]\n", " right_side[index] = name_vector[sparsecols[index]]\n", " similarity[index] = sparse_matrix.data[index]\n", "\n", " return pd.DataFrame({'left_side': left_side,\n", " 'right_side': right_side,\n", " 'similarity': similarity})\n", "\n", "\n", "def discrepancy_filter(df):\n", " f_df = df.copy()\n", " tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n", " for token in tokenlist:\n", " f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n", " (~f_df[\"left_side\"].str.contains(token)))\n", " |\n", " ((f_df[\"left_side\"].str.contains(token))&\n", " (~f_df[\"right_side\"].str.contains(token))))].copy()\n", " return f_df\n", "\n", "\n", "# Define a function to get the high and low counts for each row\n", "def get_high_low_counts(row):\n", " if row['left_count'] > row['right_count']:\n", " high_count = row['left_count']\n", " low_count = row['right_count']\n", " else: #row['left_count'] < row['right_count']:\n", " high_count = row['right_count']\n", " low_count = row['left_count']\n", " # else:\n", " # if len(row['left_side']) > len(row['right_side']):\n", " # high_count = len(row['left_side'])\n", " # low_count = len(row['right_side'])\n", " # else:\n", " # high_count = len(row['right_side'])\n", " # low_count = len(row['left_side'])\n", " return pd.Series([high_count, low_count])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 31/31 [00:00<00:00, 31.97it/s]\n" ] } ], "source": [ "merger = pd.DataFrame()\n", "\n", "# for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n", "for i in tqdm(list(univ_norm[\"Country\"].unique())):\n", " sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n", " types = sub_inst['Institution_harm']\n", " vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n", " tf_idf_matrix = vectorizer.fit_transform(types)\n", " t1 = time.time()\n", " matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8 if i!=\"China\" else 0.9)\n", " t = time.time()-t1\n", "\n", " # store the matches into new dataframe called matched_df and printing 10 samples\n", " matches_df = get_matches_df(matches, types)\n", " matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n", " matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n", " matches_df[\"Country\"] = i\n", " # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n", " # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n", " merger = pd.concat([merger,matches_df], ignore_index=True)\n", "\n", "for s in [\"left\",\"right\"]:\n", " merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n", "\n", "# Apply the function to create a new column\n", "merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n", "\n", "# Use apply again to create the high_side and low_side columns\n", "merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n", "merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], axis=1)\n", "\n", "# Drop the high_count and low_count columns if they are not needed\n", "# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1916\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "1916it [01:11, 26.94it/s]\n" ] } ], "source": [ "fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n", "fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n", "fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n", "print(len(fuzzymerger))\n", "univ_harm = univ.copy()\n", "univ_harm[\"merge_iter\"] = 0\n", "for i,row in tqdm(fuzzymerger.iterrows()):\n", " univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n", " (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n", " univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n", " (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [], "source": [ "# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [ "# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\")&\n", " (univ_harm[\"Institution\"].str.lower().str.contains(\"sapien\"))&\n", " (univ_harm[\"Institution\"].str.lower().str.contains(\"univ\"))), \"Institution_harm\"] = \"Sapienza Univ Rome\"" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "data": { "text/plain": "Institution 19821\nInstitution_harm 16646\ndtype: int64" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_harm[[\"Institution\",\"Institution_harm\"]].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 16, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Institution Country \n49282 WOS:000428099700011 Univ Sheffield United Kingdom \\\n51975 WOS:000432981300002 Chinese Acad Sci China \n64618 WOS:000459693000011 Babes Bolyai Univ Romania \n163145 WOS:000828102100001 Xidian Univ China \n99690 WOS:000566510600001 Fora Forest Technol Spain \n... ... ... ... \n1567 WOS:000293492500004 Univ Essex United Kingdom \n73076 WOS:000476471800022 Shanghai Univ China \n137096 WOS:000715426400001 Queen Mary Hosp China \n164978 WOS:000836819000003 Manchester Metropolitan Univ United Kingdom \n32973 WOS:000390181300013 Univ Complutense Madrid Spain \n\n Institution_harm merge_iter \n49282 Univ Sheffield 0 \n51975 Chinese Acad Sci 0 \n64618 Babes Bolyai Univ 0 \n163145 Xidian Univ 0 \n99690 Fora Forest Technol 0 \n... ... ... \n1567 Univ Essex 0 \n73076 Shanghai Univ 0 \n137096 Queen Mary Hosp 0 \n164978 Manchester Metropolitan Univ 0 \n32973 Univ Complutense Madrid 0 \n\n[500 rows x 5 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)InstitutionCountryInstitution_harmmerge_iter
49282WOS:000428099700011Univ SheffieldUnited KingdomUniv Sheffield0
51975WOS:000432981300002Chinese Acad SciChinaChinese Acad Sci0
64618WOS:000459693000011Babes Bolyai UnivRomaniaBabes Bolyai Univ0
163145WOS:000828102100001Xidian UnivChinaXidian Univ0
99690WOS:000566510600001Fora Forest TechnolSpainFora Forest Technol0
..................
1567WOS:000293492500004Univ EssexUnited KingdomUniv Essex0
73076WOS:000476471800022Shanghai UnivChinaShanghai Univ0
137096WOS:000715426400001Queen Mary HospChinaQueen Mary Hosp0
164978WOS:000836819000003Manchester Metropolitan UnivUnited KingdomManchester Metropolitan Univ0
32973WOS:000390181300013Univ Complutense MadridSpainUniv Complutense Madrid0
\n

500 rows × 5 columns

\n
" }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_harm.sample(500)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [], "source": [ "univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)" ], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }