|
|
|
@ -0,0 +1,440 @@
|
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 23,
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"# Importing libraries and module and some setting for notebook\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"import re\n",
|
|
|
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"from scipy.sparse import csr_matrix\n",
|
|
|
|
|
"import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n",
|
|
|
|
|
"import time\n",
|
|
|
|
|
"from tqdm import tqdm"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def wikinorm(univ_string):\n",
|
|
|
|
|
" from googlesearch import search\n",
|
|
|
|
|
" from nltk.metrics import edit_distance\n",
|
|
|
|
|
" from operator import itemgetter\n",
|
|
|
|
|
" from numpy.random import default_rng\n",
|
|
|
|
|
" rng = default_rng()\n",
|
|
|
|
|
" results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n",
|
|
|
|
|
" univ_name = univ_string.split(\",\")[0]\n",
|
|
|
|
|
" u_results = [i.title for i in results if \"Category:\" not in i.title]\n",
|
|
|
|
|
" return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 68,
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def replace_uppercase_words(text):\n",
|
|
|
|
|
" words = text.split()\n",
|
|
|
|
|
" all_uppercase = all(word.isupper() for word in words)\n",
|
|
|
|
|
" all_lowercase = all(word.islower() for word in words)\n",
|
|
|
|
|
" if all_uppercase or all_lowercase:\n",
|
|
|
|
|
" return text\n",
|
|
|
|
|
" else:\n",
|
|
|
|
|
" result = []\n",
|
|
|
|
|
" for word in words:\n",
|
|
|
|
|
" w = word.strip()\n",
|
|
|
|
|
" if not w.isupper() and not w.islower():\n",
|
|
|
|
|
" result.append(w)\n",
|
|
|
|
|
" return \" \".join(result).strip()"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 69,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"INFO: Pandarallel will run on 4 workers.\n",
|
|
|
|
|
"INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
|
|
|
|
|
"https://nalepae.github.io/pandarallel/troubleshooting/\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=38767), Label(value='0 / 38767')))…",
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
|
|
|
"version_major": 2,
|
|
|
|
|
"version_minor": 0,
|
|
|
|
|
"model_id": "ee2cde76498b4a46a2e87ea6c971aed9"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"155067\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"outdir=\"wos_processed_data\"\n",
|
|
|
|
|
"univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from pandarallel import pandarallel\n",
|
|
|
|
|
"pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
|
|
|
|
|
"print(len(univ))"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 39,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": " UT (Unique WOS ID) Institution Country \n84810 WOS:000571399800004 Univ Birmingham United Kingdom \\\n122264 WOS:000732918800001 Univ Southampton United Kingdom \n135675 WOS:000799234000004 UCL United Kingdom \n153134 WOS:000900724501058 Kore Univ Enna Italy \n51445 WOS:000455277600005 Univ Sheffield United Kingdom \n... ... ... ... \n21043 WOS:000372583700005 Vrije Univ Amsterdam Netherlands \n1938 WOS:000297611600011 Univ Essex United Kingdom \n64691 WOS:000490430500091 Xian Jiaotong Liverpool Univ China \n25740 WOS:000386793200001 Chinese Acad Sci China \n112682 WOS:000696110800001 Dalian Univ Technol China \n\n Institution_harm \n84810 Univ Birmingham \n122264 Univ Southampton \n135675 UCL \n153134 Kore Univ Enna \n51445 Univ Sheffield \n... ... \n21043 Vrije Univ Amsterdam \n1938 Univ Essex \n64691 Xian Jiaotong Liverpool Univ \n25740 Chinese Acad Sci \n112682 Dalian Univ Technol \n\n[100 rows x 4 columns]",
|
|
|
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>84810</th>\n <td>WOS:000571399800004</td>\n <td>Univ Birmingham</td>\n <td>United Kingdom</td>\n <td>Univ Birmingham</td>\n </tr>\n <tr>\n <th>122264</th>\n <td>WOS:000732918800001</td>\n <td>Univ Southampton</td>\n <td>United Kingdom</td>\n <td>Univ Southampton</td>\n </tr>\n <tr>\n <th>135675</th>\n <td>WOS:000799234000004</td>\n <td>UCL</td>\n <td>United Kingdom</td>\n <td>UCL</td>\n </tr>\n <tr>\n <th>153134</th>\n <td>WOS:000900724501058</td>\n <td>Kore Univ Enna</td>\n <td>Italy</td>\n <td>Kore Univ Enna</td>\n </tr>\n <tr>\n <th>51445</th>\n <td>WOS:000455277600005</td>\n <td>Univ Sheffield</td>\n <td>United Kingdom</td>\n <td>Univ Sheffield</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>21043</th>\n <td>WOS:000372583700005</td>\n <td>Vrije Univ Amsterdam</td>\n <td>Netherlands</td>\n <td>Vrije Univ Amsterdam</td>\n </tr>\n <tr>\n <th>1938</th>\n <td>WOS:000297611600011</td>\n <td>Univ Essex</td>\n <td>United Kingdom</td>\n <td>Univ Essex</td>\n </tr>\n <tr>\n <th>64691</th>\n <td>WOS:000490430500091</td>\n <td>Xian Jiaotong Liverpool Univ</td>\n <td>China</td>\n <td>Xian Jiaotong Liverpool Univ</td>\n </tr>\n <tr>\n <th>25740</th>\n <td>WOS:000386793200001</td>\n <td>Chinese Acad Sci</td>\n <td>China</td>\n <td>Chinese Acad Sci</td>\n </tr>\n <tr>\n <th>112682</th>\n <td>WOS:000696110800001</td>\n <td>Dalian Univ Technol</td>\n <td>China</td>\n <td>Dalian Univ Technol</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 39,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"univ.sample(100)"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 71,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": " Country Institution_harm \n7736 France Yncrea Ouest \\\n13752 Spain Univ Carlos \n15855 United Kingdom Northumbria Univ Newcastle Upon Tyne \n12514 Norway Nord Univ \n602 China Henan Polytech Univ \n... ... ... \n11620 Italy Deep Blue Srl \n11183 Italy Univ Giustino Fortunato \n7433 Estonia Platinum Software Dev Co \n5129 China State & Local Joint Engn Lab Estuarine Hydraul Te \n6799 China MOA \n\n count \n7736 9 \n13752 1 \n15855 1 \n12514 1 \n602 87 \n... ... \n11620 1 \n11183 3 \n7433 1 \n5129 1 \n6799 1 \n\n[100 rows x 3 columns]",
|
|
|
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>7736</th>\n <td>France</td>\n <td>Yncrea Ouest</td>\n <td>9</td>\n </tr>\n <tr>\n <th>13752</th>\n <td>Spain</td>\n <td>Univ Carlos</td>\n <td>1</td>\n </tr>\n <tr>\n <th>15855</th>\n <td>United Kingdom</td>\n <td>Northumbria Univ Newcastle Upon Tyne</td>\n <td>1</td>\n </tr>\n <tr>\n <th>12514</th>\n <td>Norway</td>\n <td>Nord Univ</td>\n <td>1</td>\n </tr>\n <tr>\n <th>602</th>\n <td>China</td>\n <td>Henan Polytech Univ</td>\n <td>87</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>11620</th>\n <td>Italy</td>\n <td>Deep Blue Srl</td>\n <td>1</td>\n </tr>\n <tr>\n <th>11183</th>\n <td>Italy</td>\n <td>Univ Giustino Fortunato</td>\n <td>3</td>\n </tr>\n <tr>\n <th>7433</th>\n <td>Estonia</td>\n <td>Platinum Software Dev Co</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5129</th>\n <td>China</td>\n <td>State & Local Joint Engn Lab Estuarine Hydraul Te</td>\n <td>1</td>\n </tr>\n <tr>\n <th>6799</th>\n <td>China</td>\n <td>MOA</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 71,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n",
|
|
|
|
|
"# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n",
|
|
|
|
|
"univ_norm.sample(100)"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 72,
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# from pandarallel import pandarallel\n",
|
|
|
|
|
"# pandarallel.initialize(progress_bar=True, nb_workers=2)\n",
|
|
|
|
|
"#\n",
|
|
|
|
|
"# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 73,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": "['Austria', 'Belgium']"
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 73,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"list(univ_norm[\"Country\"].unique())[0:2]"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 95,
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def ngrams(string, n=3):\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" string = re.sub(r'[,-./]|\\sBD',r'', string)\n",
|
|
|
|
|
" ngrams = zip(*[string[i:] for i in range(n)])\n",
|
|
|
|
|
" return [''.join(ngram) for ngram in ngrams]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.\n",
|
|
|
|
|
"# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
|
|
|
|
|
" # force A and B as a CSR matrix.\n",
|
|
|
|
|
" # If they have already been CSR, there is no overhead\n",
|
|
|
|
|
" A = A.tocsr()\n",
|
|
|
|
|
" B = B.tocsr()\n",
|
|
|
|
|
" M, _ = A.shape\n",
|
|
|
|
|
" _, N = B.shape\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" idx_dtype = np.int32\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" nnz_max = M*ntop\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" indptr = np.zeros(M+1, dtype=idx_dtype)\n",
|
|
|
|
|
" indices = np.zeros(nnz_max, dtype=idx_dtype)\n",
|
|
|
|
|
" data = np.zeros(nnz_max, dtype=A.dtype)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" ct.sparse_dot_topn(\n",
|
|
|
|
|
" M, N, np.asarray(A.indptr, dtype=idx_dtype),\n",
|
|
|
|
|
" np.asarray(A.indices, dtype=idx_dtype),\n",
|
|
|
|
|
" A.data,\n",
|
|
|
|
|
" np.asarray(B.indptr, dtype=idx_dtype),\n",
|
|
|
|
|
" np.asarray(B.indices, dtype=idx_dtype),\n",
|
|
|
|
|
" B.data,\n",
|
|
|
|
|
" ntop,\n",
|
|
|
|
|
" lower_bound,\n",
|
|
|
|
|
" indptr, indices, data)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" return csr_matrix((data,indices,indptr),shape=(M,N))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# unpacks the resulting sparse matrix\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def get_matches_df(sparse_matrix, name_vector, top=None):\n",
|
|
|
|
|
" non_zeros = sparse_matrix.nonzero()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" sparserows = non_zeros[0]\n",
|
|
|
|
|
" sparsecols = non_zeros[1]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" if top:\n",
|
|
|
|
|
" nr_matches = top\n",
|
|
|
|
|
" else:\n",
|
|
|
|
|
" nr_matches = sparsecols.size\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" left_side = np.empty([nr_matches], dtype=object)\n",
|
|
|
|
|
" right_side = np.empty([nr_matches], dtype=object)\n",
|
|
|
|
|
" similarity = np.zeros(nr_matches)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" for index in range(0, nr_matches):\n",
|
|
|
|
|
" left_side[index] = name_vector[sparserows[index]]\n",
|
|
|
|
|
" right_side[index] = name_vector[sparsecols[index]]\n",
|
|
|
|
|
" similarity[index] = sparse_matrix.data[index]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" return pd.DataFrame({'left_side': left_side,\n",
|
|
|
|
|
" 'right_side': right_side,\n",
|
|
|
|
|
" 'similarity': similarity})\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def discrepancy_filter(df):\n",
|
|
|
|
|
" f_df = df.copy()\n",
|
|
|
|
|
" tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n",
|
|
|
|
|
" for token in tokenlist:\n",
|
|
|
|
|
" f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n",
|
|
|
|
|
" (~f_df[\"left_side\"].str.contains(token)))\n",
|
|
|
|
|
" |\n",
|
|
|
|
|
" ((f_df[\"left_side\"].str.contains(token))&\n",
|
|
|
|
|
" (~f_df[\"right_side\"].str.contains(token))))].copy()\n",
|
|
|
|
|
" return f_df\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Define a function to get the high and low counts for each row\n",
|
|
|
|
|
"def get_high_low_counts(row):\n",
|
|
|
|
|
" if row['left_count'] > row['right_count']:\n",
|
|
|
|
|
" high_count = row['left_count']\n",
|
|
|
|
|
" low_count = row['right_count']\n",
|
|
|
|
|
" else: #row['left_count'] < row['right_count']:\n",
|
|
|
|
|
" high_count = row['right_count']\n",
|
|
|
|
|
" low_count = row['left_count']\n",
|
|
|
|
|
" # else:\n",
|
|
|
|
|
" # if len(row['left_side']) > len(row['right_side']):\n",
|
|
|
|
|
" # high_count = len(row['left_side'])\n",
|
|
|
|
|
" # low_count = len(row['right_side'])\n",
|
|
|
|
|
" # else:\n",
|
|
|
|
|
" # high_count = len(row['right_side'])\n",
|
|
|
|
|
" # low_count = len(row['left_side'])\n",
|
|
|
|
|
" return pd.Series([high_count, low_count])"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 130,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"30it [00:00, 53.27it/s]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"merger = pd.DataFrame()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n",
|
|
|
|
|
" sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n",
|
|
|
|
|
" types = sub_inst['Institution_harm']\n",
|
|
|
|
|
" vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
|
|
|
|
|
" tf_idf_matrix = vectorizer.fit_transform(types)\n",
|
|
|
|
|
" t1 = time.time()\n",
|
|
|
|
|
" matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)\n",
|
|
|
|
|
" t = time.time()-t1\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # store the matches into new dataframe called matched_df and printing 10 samples\n",
|
|
|
|
|
" matches_df = get_matches_df(matches, types)\n",
|
|
|
|
|
" matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n",
|
|
|
|
|
" matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n",
|
|
|
|
|
" matches_df[\"Country\"] = i\n",
|
|
|
|
|
" # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n",
|
|
|
|
|
" # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n",
|
|
|
|
|
" merger = pd.concat([merger,matches_df], ignore_index=True)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"for s in [\"left\",\"right\"]:\n",
|
|
|
|
|
" merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Apply the function to create a new column\n",
|
|
|
|
|
"merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Use apply again to create the high_side and low_side columns\n",
|
|
|
|
|
"merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n",
|
|
|
|
|
"merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], axis=1)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Drop the high_count and low_count columns if they are not needed\n",
|
|
|
|
|
"# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"source": [],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 131,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"1192\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"1192it [00:44, 26.68it/s]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n",
|
|
|
|
|
"fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n",
|
|
|
|
|
"fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n",
|
|
|
|
|
"print(len(fuzzymerger))\n",
|
|
|
|
|
"univ_harm = univ.copy()\n",
|
|
|
|
|
"univ_harm[\"merge_iter\"] = 0\n",
|
|
|
|
|
"for i,row in tqdm(fuzzymerger.iterrows()):\n",
|
|
|
|
|
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
|
|
|
|
|
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n",
|
|
|
|
|
" univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
|
|
|
|
|
" (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"source": [],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 120,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": " UT (Unique WOS ID) Institution Country \n244 WOS:000286472300003 Univ Trent Italy \\\n364 WOS:000287586100011 Univ Trent Italy \n410 WOS:000287939200011 Abdus Salam Int Ctr Theoret Phys Italy \n765 WOS:000290996200002 Univ Trent Italy \n907 WOS:000291698400013 INFN Sez Roma 1 Italy \n... ... ... ... \n153063 WOS:000900129900175 Univ Rome Campus Biomed Aquila Italy \n154775 WOS:000929737300001 Prevent & Res Inst Italy \n154813 WOS:000929737300001 Ist Super Sanit Italy \n154855 WOS:000933331200004 Univ Federio II Italy \n154857 WOS:000933331200004 INAF Osservatorio Astron Capodimonte Italy \n\n Institution_harm merge_iter \n244 Univ Trento 1 \n364 Univ Trento 1 \n410 Abdus Salaam Int Ctr Theoret Phys 1 \n765 Univ Trento 1 \n907 Sez Roma 1 \n... ... ... \n153063 Univ Rome Campus Biomed LAquila 1 \n154775 Prevent & Res Inst 2 \n154813 Ist Super Sanita 1 \n154855 Univ Federio 2 \n154857 Osserv Astron Capodimonte 1 \n\n[375 rows x 5 columns]",
|
|
|
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>merge_iter</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>244</th>\n <td>WOS:000286472300003</td>\n <td>Univ Trent</td>\n <td>Italy</td>\n <td>Univ Trento</td>\n <td>1</td>\n </tr>\n <tr>\n <th>364</th>\n <td>WOS:000287586100011</td>\n <td>Univ Trent</td>\n <td>Italy</td>\n <td>Univ Trento</td>\n <td>1</td>\n </tr>\n <tr>\n <th>410</th>\n <td>WOS:000287939200011</td>\n <td>Abdus Salam Int Ctr Theoret Phys</td>\n <td>Italy</td>\n <td>Abdus Salaam Int Ctr Theoret Phys</td>\n <td>1</td>\n </tr>\n <tr>\n <th>765</th>\n <td>WOS:000290996200002</td>\n <td>Univ Trent</td>\n <td>Italy</td>\n <td>Univ Trento</td>\n <td>1</td>\n </tr>\n <tr>\n <th>907</th>\n <td>WOS:000291698400013</td>\n <td>INFN Sez Roma 1</td>\n <td>Italy</td>\n <td>Sez Roma</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>153063</th>\n <td>WOS:000900129900175</td>\n <td>Univ Rome Campus Biomed Aquila</td>\n <td>Italy</td>\n <td>Univ Rome Campus Biomed LAquila</td>\n <td>1</td>\n </tr>\n <tr>\n <th>154775</th>\n <td>WOS:000929737300001</td>\n <td>Prevent & Res Inst</td>\n <td>Italy</td>\n <td>Prevent & Res Inst</td>\n <td>2</td>\n </tr>\n <tr>\n <th>154813</th>\n <td>WOS:000929737300001</td>\n <td>Ist Super Sanit</td>\n <td>Italy</td>\n <td>Ist Super Sanita</td>\n <td>1</td>\n </tr>\n <tr>\n <th>154855</th>\n <td>WOS:000933331200004</td>\n <td>Univ Federio II</td>\n <td>Italy</td>\n <td>Univ Federio</td>\n <td>2</td>\n </tr>\n <tr>\n <th>154857</th>\n <td>WOS:000933331200004</td>\n <td>INAF Osservatorio Astron Capodimonte</td>\n <td>Italy</td>\n <td>Osserv Astron Capodimonte</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n<p>375 rows × 5 columns</p>\n</div>"
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 120,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 2
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython2",
|
|
|
|
|
"version": "2.7.6"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 0
|
|
|
|
|
}
|