general progress commit

main
radvanyimome 1 year ago
parent c156adaf05
commit 1e89b1d153

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 191,
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 192,
"execution_count": 2,
"outputs": [],
"source": [
"def wikinorm(univ_string):\n",
@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 193,
"execution_count": 3,
"outputs": [],
"source": [
"def replace_uppercase_words(text):\n",
@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 194,
"execution_count": 4,
"outputs": [
{
"name": "stdout",
@ -81,11 +81,11 @@
},
{
"data": {
"text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=38767), Label(value='0 / 38767')))…",
"text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "8551fdcfc52a43108a78c1e91915c681"
"model_id": "92c1cd6c14644ffeb042b38f5d5d98c5"
}
},
"metadata": {},
@ -95,7 +95,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"155067\n"
"178638\n"
]
}
],
@ -115,14 +115,14 @@
},
{
"cell_type": "code",
"execution_count": 195,
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution \n1094 WOS:000292330300050 Hong Kong Polytech Univ \\\n21547 WOS:000374363900001 Guangdong Univ Technol \n53778 WOS:000459846300019 Aarhus Univ \n153776 WOS:000907044000014 Univ Siena \n81562 WOS:000554591602038 China Natl Elect Import Export Corp \n... ... ... \n29206 WOS:000397047200002 Univ Duisburg Essen \n21658 WOS:000374617600020 Univ Southampton \n43289 WOS:000434742800004 Univ Strathclyde \n37200 WOS:000418525100013 Goethe Univ Frankfurt \n95964 WOS:000616310200013 Eindhoven Univ Technol \n\n Country Institution_harm \n1094 China Hong Kong Polytech Univ \n21547 China Guangdong Univ Technol \n53778 Denmark Aarhus Univ \n153776 Italy Univ Siena \n81562 China China Natl Elect Import Export Corp \n... ... ... \n29206 Germany Univ Duisburg Essen \n21658 United Kingdom Univ Southampton \n43289 United Kingdom Univ Strathclyde \n37200 Germany Goethe Univ Frankfurt \n95964 Netherlands Eindhoven Univ Technol \n\n[100 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1094</th>\n <td>WOS:000292330300050</td>\n <td>Hong Kong Polytech Univ</td>\n <td>China</td>\n <td>Hong Kong Polytech Univ</td>\n </tr>\n <tr>\n <th>21547</th>\n <td>WOS:000374363900001</td>\n <td>Guangdong Univ Technol</td>\n <td>China</td>\n <td>Guangdong Univ Technol</td>\n </tr>\n <tr>\n <th>53778</th>\n <td>WOS:000459846300019</td>\n <td>Aarhus Univ</td>\n <td>Denmark</td>\n <td>Aarhus Univ</td>\n </tr>\n <tr>\n <th>153776</th>\n <td>WOS:000907044000014</td>\n <td>Univ Siena</td>\n <td>Italy</td>\n <td>Univ Siena</td>\n </tr>\n <tr>\n <th>81562</th>\n <td>WOS:000554591602038</td>\n <td>China Natl Elect Import Export Corp</td>\n <td>China</td>\n <td>China Natl Elect Import Export Corp</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>29206</th>\n <td>WOS:000397047200002</td>\n <td>Univ Duisburg Essen</td>\n <td>Germany</td>\n <td>Univ Duisburg Essen</td>\n </tr>\n <tr>\n <th>21658</th>\n <td>WOS:000374617600020</td>\n <td>Univ Southampton</td>\n <td>United Kingdom</td>\n <td>Univ Southampton</td>\n </tr>\n <tr>\n <th>43289</th>\n <td>WOS:000434742800004</td>\n <td>Univ Strathclyde</td>\n <td>United Kingdom</td>\n <td>Univ Strathclyde</td>\n </tr>\n <tr>\n <th>37200</th>\n <td>WOS:000418525100013</td>\n <td>Goethe Univ Frankfurt</td>\n <td>Germany</td>\n <td>Goethe Univ Frankfurt</td>\n </tr>\n <tr>\n <th>95964</th>\n <td>WOS:000616310200013</td>\n <td>Eindhoven Univ Technol</td>\n <td>Netherlands</td>\n <td>Eindhoven Univ Technol</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
"text/plain": " UT (Unique WOS ID) Institution \n153271 WOS:000784587900008 Univ Pisa \\\n159800 WOS:000810042500002 China Japan Friendship Hosp \n130931 WOS:000691922800007 Karl Franzens Univ Graz \n1500 WOS:000292944600012 CNR \n113964 WOS:000618210000032 Karolinska Univ Hosp \n... ... ... \n160284 WOS:000812227000009 Univ Appl Sci Upper Austria \n29314 WOS:000381396400013 Univ Southampton \n17045 WOS:000347046200017 Charles Univ Prague \n164118 WOS:000832954200001 Nanjing Univ Aeronaut & Astronaut \n109992 WOS:000604257500070 KTH Royal Inst Technol \n\n Country Institution_harm \n153271 Italy Univ Pisa \n159800 China China Japan Friendship Hosp \n130931 Austria Karl Franzens Univ Graz \n1500 Italy CNR \n113964 Sweden Karolinska Univ Hosp \n... ... ... \n160284 Austria Univ Appl Sci Upper Austria \n29314 United Kingdom Univ Southampton \n17045 Czech Republic Charles Univ Prague \n164118 China Nanjing Univ Aeronaut & Astronaut \n109992 Sweden Royal Inst Technol \n\n[100 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>153271</th>\n <td>WOS:000784587900008</td>\n <td>Univ Pisa</td>\n <td>Italy</td>\n <td>Univ Pisa</td>\n </tr>\n <tr>\n <th>159800</th>\n <td>WOS:000810042500002</td>\n <td>China Japan Friendship Hosp</td>\n <td>China</td>\n <td>China Japan Friendship Hosp</td>\n </tr>\n <tr>\n <th>130931</th>\n <td>WOS:000691922800007</td>\n <td>Karl Franzens Univ Graz</td>\n <td>Austria</td>\n <td>Karl Franzens Univ Graz</td>\n </tr>\n <tr>\n <th>1500</th>\n <td>WOS:000292944600012</td>\n <td>CNR</td>\n <td>Italy</td>\n <td>CNR</td>\n </tr>\n <tr>\n <th>113964</th>\n <td>WOS:000618210000032</td>\n <td>Karolinska Univ Hosp</td>\n <td>Sweden</td>\n <td>Karolinska Univ Hosp</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>160284</th>\n <td>WOS:000812227000009</td>\n <td>Univ Appl Sci Upper Austria</td>\n <td>Austria</td>\n <td>Univ Appl Sci Upper Austria</td>\n </tr>\n <tr>\n <th>29314</th>\n <td>WOS:000381396400013</td>\n <td>Univ Southampton</td>\n <td>United Kingdom</td>\n <td>Univ Southampton</td>\n </tr>\n <tr>\n <th>17045</th>\n <td>WOS:000347046200017</td>\n <td>Charles Univ Prague</td>\n <td>Czech Republic</td>\n <td>Charles Univ Prague</td>\n </tr>\n <tr>\n <th>164118</th>\n <td>WOS:000832954200001</td>\n <td>Nanjing Univ Aeronaut &amp; Astronaut</td>\n <td>China</td>\n <td>Nanjing Univ Aeronaut &amp; Astronaut</td>\n </tr>\n <tr>\n <th>109992</th>\n <td>WOS:000604257500070</td>\n <td>KTH Royal Inst Technol</td>\n <td>Sweden</td>\n <td>Royal Inst Technol</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
},
"execution_count": 195,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -136,14 +136,14 @@
},
{
"cell_type": "code",
"execution_count": 196,
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": " Country Institution_harm count\n12655 Poland Space Res Ctr 6\n12940 Portugal Ctr Invest Energia State Grid 1\n616 China Minist Nat Resources 78\n5561 China PowerChina Huadong Engn Corp Ltd 1\n514 China Chongqing Univ 478\n... ... ... ...\n476 Bulgaria Tech Univ 1\n12454 Norway Stavanger Univ Hosp 9\n5489 China Shanghai Sports Sch 1\n768 China Hubei Univ 25\n13527 Spain Jimenez Diaz Univ Hosp 2\n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12655</th>\n <td>Poland</td>\n <td>Space Res Ctr</td>\n <td>6</td>\n </tr>\n <tr>\n <th>12940</th>\n <td>Portugal</td>\n <td>Ctr Invest Energia State Grid</td>\n <td>1</td>\n </tr>\n <tr>\n <th>616</th>\n <td>China</td>\n <td>Minist Nat Resources</td>\n <td>78</td>\n </tr>\n <tr>\n <th>5561</th>\n <td>China</td>\n <td>PowerChina Huadong Engn Corp Ltd</td>\n <td>1</td>\n </tr>\n <tr>\n <th>514</th>\n <td>China</td>\n <td>Chongqing Univ</td>\n <td>478</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>476</th>\n <td>Bulgaria</td>\n <td>Tech Univ</td>\n <td>1</td>\n </tr>\n <tr>\n <th>12454</th>\n <td>Norway</td>\n <td>Stavanger Univ Hosp</td>\n <td>9</td>\n </tr>\n <tr>\n <th>5489</th>\n <td>China</td>\n <td>Shanghai Sports Sch</td>\n <td>1</td>\n </tr>\n <tr>\n <th>768</th>\n <td>China</td>\n <td>Hubei Univ</td>\n <td>25</td>\n </tr>\n <tr>\n <th>13527</th>\n <td>Spain</td>\n <td>Jimenez Diaz Univ Hosp</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
"text/plain": " Country Institution_harm count\n8168 Croatia Inst Adriat Crops & Karst Reclamat 1\n3417 China Ctr Eye & Vis Res 1\n1034 China Westlake Inst Adv Study 13\n13427 Italy Macerata Hosp 1\n8071 China Key Lab Ecoind Green Technol Fujian Prov 1\n... ... ... ...\n17230 United Kingdom Univ Kingston 6\n8847 France Univ Artois 8\n16071 Spain Catalonia Geriatr & Gerontol Soc 1\n6357 China Wuxi Huace Elect Syst Co Ltd 1\n9049 France Excelia Business Sch 3\n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>8168</th>\n <td>Croatia</td>\n <td>Inst Adriat Crops &amp; Karst Reclamat</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3417</th>\n <td>China</td>\n <td>Ctr Eye &amp; Vis Res</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1034</th>\n <td>China</td>\n <td>Westlake Inst Adv Study</td>\n <td>13</td>\n </tr>\n <tr>\n <th>13427</th>\n <td>Italy</td>\n <td>Macerata Hosp</td>\n <td>1</td>\n </tr>\n <tr>\n <th>8071</th>\n <td>China</td>\n <td>Key Lab Ecoind Green Technol Fujian Prov</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>17230</th>\n <td>United Kingdom</td>\n <td>Univ Kingston</td>\n <td>6</td>\n </tr>\n <tr>\n <th>8847</th>\n <td>France</td>\n <td>Univ Artois</td>\n <td>8</td>\n </tr>\n <tr>\n <th>16071</th>\n <td>Spain</td>\n <td>Catalonia Geriatr &amp; Gerontol Soc</td>\n <td>1</td>\n </tr>\n <tr>\n <th>6357</th>\n <td>China</td>\n <td>Wuxi Huace Elect Syst Co Ltd</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9049</th>\n <td>France</td>\n <td>Excelia Business Sch</td>\n <td>3</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 196,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -159,7 +159,7 @@
},
{
"cell_type": "code",
"execution_count": 197,
"execution_count": 7,
"outputs": [],
"source": [
"# from pandarallel import pandarallel\n",
@ -173,7 +173,7 @@
},
{
"cell_type": "code",
"execution_count": 198,
"execution_count": 8,
"outputs": [],
"source": [
"def ngrams(string, n=3):\n",
@ -276,13 +276,13 @@
},
{
"cell_type": "code",
"execution_count": 199,
"execution_count": 9,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 31/31 [00:00<00:00, 32.89it/s]\n"
"100%|██████████| 31/31 [00:00<00:00, 31.97it/s]\n"
]
}
],
@ -334,20 +334,20 @@
},
{
"cell_type": "code",
"execution_count": 200,
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1538\n"
"1916\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1538it [01:04, 23.79it/s]\n"
"1916it [01:11, 26.94it/s]\n"
]
}
],
@ -370,18 +370,8 @@
},
{
"cell_type": "code",
"execution_count": 201,
"outputs": [
{
"data": {
"text/plain": " Country low_side \n0 China Logist Univ Chinese Peoples Armed Police Forces \\\n9 China Flight Automat Control Res Inst \n10 China Northwest Elect Power Design Inst Co Ltd \n11 China Northwest Elect Power Design Inst Co Ltd China \n12 China Northwest Inst Ecoenvironm & Resources \n... ... ... \n1531 China Chinese Univ Hong Kong Hong \n1532 China Huazhong Univ Sci & Techno \n1533 China Hong Kong Polytech Univ Hong Kong \n1534 China Kong Kong Polytech Univ \n1537 China Univ Elect Sci & Technol Chin \n\n high_side high_count \n0 Logist Univ Chinese Peoples Armed Police Force 1 \\\n9 Xian Flight Automat Control Res Inst 1 \n10 Northwest Elect Power Design Inst Co Ltd China 1 \n11 Northwest Elect Power Design Inst Co Ltd 1 \n12 Northwest Inst Ecoenvironm & Resources Chinese Ac 1 \n... ... ... \n1531 Chinese Univ Hong Kong 728 \n1532 Huazhong Univ Sci & Technol 729 \n1533 Hong Kong Polytech Univ 809 \n1534 Hong Kong Polytech Univ 809 \n1537 Univ Elect Sci & Technol China 1076 \n\n low_count similarity \n0 1 0.988072 \n9 1 0.905747 \n10 1 0.926984 \n11 1 0.926984 \n12 1 0.910630 \n... ... ... \n1531 1 0.935944 \n1532 1 0.989260 \n1533 1 0.917345 \n1534 1 0.939416 \n1537 1 0.983258 \n\n[346 rows x 6 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>low_side</th>\n <th>high_side</th>\n <th>high_count</th>\n <th>low_count</th>\n <th>similarity</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>China</td>\n <td>Logist Univ Chinese Peoples Armed Police Forces</td>\n <td>Logist Univ Chinese Peoples Armed Police Force</td>\n <td>1</td>\n <td>1</td>\n <td>0.988072</td>\n </tr>\n <tr>\n <th>9</th>\n <td>China</td>\n <td>Flight Automat Control Res Inst</td>\n <td>Xian Flight Automat Control Res Inst</td>\n <td>1</td>\n <td>1</td>\n <td>0.905747</td>\n </tr>\n <tr>\n <th>10</th>\n <td>China</td>\n <td>Northwest Elect Power Design Inst Co Ltd</td>\n <td>Northwest Elect Power Design Inst Co Ltd China</td>\n <td>1</td>\n <td>1</td>\n <td>0.926984</td>\n </tr>\n <tr>\n <th>11</th>\n <td>China</td>\n <td>Northwest Elect Power Design Inst Co Ltd China</td>\n <td>Northwest Elect Power Design Inst Co Ltd</td>\n <td>1</td>\n <td>1</td>\n <td>0.926984</td>\n </tr>\n <tr>\n <th>12</th>\n <td>China</td>\n <td>Northwest Inst Ecoenvironm &amp; Resources</td>\n <td>Northwest Inst Ecoenvironm &amp; Resources Chinese Ac</td>\n <td>1</td>\n <td>1</td>\n <td>0.910630</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1531</th>\n <td>China</td>\n <td>Chinese Univ Hong Kong Hong</td>\n <td>Chinese Univ Hong Kong</td>\n <td>728</td>\n <td>1</td>\n <td>0.935944</td>\n </tr>\n <tr>\n <th>1532</th>\n <td>China</td>\n <td>Huazhong Univ Sci &amp; Techno</td>\n <td>Huazhong Univ Sci &amp; Technol</td>\n <td>729</td>\n <td>1</td>\n <td>0.989260</td>\n </tr>\n <tr>\n <th>1533</th>\n <td>China</td>\n <td>Hong Kong Polytech Univ Hong Kong</td>\n <td>Hong Kong Polytech Univ</td>\n <td>809</td>\n <td>1</td>\n <td>0.917345</td>\n </tr>\n <tr>\n <th>1534</th>\n <td>China</td>\n <td>Kong Kong Polytech Univ</td>\n <td>Hong Kong Polytech Univ</td>\n <td>809</td>\n <td>1</td>\n <td>0.939416</td>\n </tr>\n <tr>\n <th>1537</th>\n <td>China</td>\n <td>Univ Elect Sci &amp; Technol Chin</td>\n <td>Univ Elect Sci &amp; Technol China</td>\n <td>1076</td>\n <td>1</td>\n <td>0.983258</td>\n </tr>\n </tbody>\n</table>\n<p>346 rows × 6 columns</p>\n</div>"
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 11,
"outputs": [],
"source": [
"# fuzzymerger[fuzzymerger[\"Country\"]==\"China\"]"
],
@ -398,18 +388,8 @@
},
{
"cell_type": "code",
"execution_count": 202,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution Country \n244 WOS:000286472300003 Univ Trent Italy \\\n364 WOS:000287586100011 Univ Trent Italy \n410 WOS:000287939200011 Abdus Salam Int Ctr Theoret Phys Italy \n765 WOS:000290996200002 Univ Trent Italy \n907 WOS:000291698400013 INFN Sez Roma 1 Italy \n... ... ... ... \n153063 WOS:000900129900175 Univ Rome Campus Biomed Aquila Italy \n154775 WOS:000929737300001 Prevent & Res Inst Italy \n154813 WOS:000929737300001 Ist Super Sanit Italy \n154855 WOS:000933331200004 Univ Federio II Italy \n154857 WOS:000933331200004 INAF Osservatorio Astron Capodimonte Italy \n\n Institution_harm merge_iter \n244 Univ Trento 1 \n364 Univ Trento 1 \n410 Abdus Salaam Int Ctr Theoret Phys 1 \n765 Univ Trento 1 \n907 Sez Roma 1 \n... ... ... \n153063 Univ Rome Campus Biomed Aquila 2 \n154775 Prevent & Res Inst 2 \n154813 Ist Super Sanita 1 \n154855 Univ Federico 1 \n154857 Osserv Astron Capodimonte 1 \n\n[375 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>merge_iter</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>244</th>\n <td>WOS:000286472300003</td>\n <td>Univ Trent</td>\n <td>Italy</td>\n <td>Univ Trento</td>\n <td>1</td>\n </tr>\n <tr>\n <th>364</th>\n <td>WOS:000287586100011</td>\n <td>Univ Trent</td>\n <td>Italy</td>\n <td>Univ Trento</td>\n <td>1</td>\n </tr>\n <tr>\n <th>410</th>\n <td>WOS:000287939200011</td>\n <td>Abdus Salam Int Ctr Theoret Phys</td>\n <td>Italy</td>\n <td>Abdus Salaam Int Ctr Theoret Phys</td>\n <td>1</td>\n </tr>\n <tr>\n <th>765</th>\n <td>WOS:000290996200002</td>\n <td>Univ Trent</td>\n <td>Italy</td>\n <td>Univ Trento</td>\n <td>1</td>\n </tr>\n <tr>\n <th>907</th>\n <td>WOS:000291698400013</td>\n <td>INFN Sez Roma 1</td>\n <td>Italy</td>\n <td>Sez Roma</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>153063</th>\n <td>WOS:000900129900175</td>\n <td>Univ Rome Campus Biomed Aquila</td>\n <td>Italy</td>\n <td>Univ Rome Campus Biomed Aquila</td>\n <td>2</td>\n </tr>\n <tr>\n <th>154775</th>\n <td>WOS:000929737300001</td>\n <td>Prevent &amp; Res Inst</td>\n <td>Italy</td>\n <td>Prevent &amp; Res Inst</td>\n <td>2</td>\n </tr>\n <tr>\n <th>154813</th>\n <td>WOS:000929737300001</td>\n <td>Ist Super Sanit</td>\n <td>Italy</td>\n <td>Ist Super Sanita</td>\n <td>1</td>\n </tr>\n <tr>\n <th>154855</th>\n <td>WOS:000933331200004</td>\n <td>Univ Federio II</td>\n <td>Italy</td>\n <td>Univ Federico</td>\n <td>1</td>\n </tr>\n <tr>\n <th>154857</th>\n <td>WOS:000933331200004</td>\n <td>INAF Osservatorio Astron Capodimonte</td>\n <td>Italy</td>\n <td>Osserv Astron Capodimonte</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n<p>375 rows × 5 columns</p>\n</div>"
},
"execution_count": 202,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 12,
"outputs": [],
"source": [
"# univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
],
@ -419,7 +399,7 @@
},
{
"cell_type": "code",
"execution_count": 208,
"execution_count": 13,
"outputs": [],
"source": [
"univ_harm.loc[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\")&\n",
@ -432,13 +412,13 @@
},
{
"cell_type": "code",
"execution_count": 209,
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "Institution 17083\nInstitution_harm 14449\ndtype: int64"
"text/plain": "Institution 19821\nInstitution_harm 16646\ndtype: int64"
},
"execution_count": 209,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -452,7 +432,28 @@
},
{
"cell_type": "code",
"execution_count": 210,
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Institution Country \n49282 WOS:000428099700011 Univ Sheffield United Kingdom \\\n51975 WOS:000432981300002 Chinese Acad Sci China \n64618 WOS:000459693000011 Babes Bolyai Univ Romania \n163145 WOS:000828102100001 Xidian Univ China \n99690 WOS:000566510600001 Fora Forest Technol Spain \n... ... ... ... \n1567 WOS:000293492500004 Univ Essex United Kingdom \n73076 WOS:000476471800022 Shanghai Univ China \n137096 WOS:000715426400001 Queen Mary Hosp China \n164978 WOS:000836819000003 Manchester Metropolitan Univ United Kingdom \n32973 WOS:000390181300013 Univ Complutense Madrid Spain \n\n Institution_harm merge_iter \n49282 Univ Sheffield 0 \n51975 Chinese Acad Sci 0 \n64618 Babes Bolyai Univ 0 \n163145 Xidian Univ 0 \n99690 Fora Forest Technol 0 \n... ... ... \n1567 Univ Essex 0 \n73076 Shanghai Univ 0 \n137096 Queen Mary Hosp 0 \n164978 Manchester Metropolitan Univ 0 \n32973 Univ Complutense Madrid 0 \n\n[500 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Institution</th>\n <th>Country</th>\n <th>Institution_harm</th>\n <th>merge_iter</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>49282</th>\n <td>WOS:000428099700011</td>\n <td>Univ Sheffield</td>\n <td>United Kingdom</td>\n <td>Univ Sheffield</td>\n <td>0</td>\n </tr>\n <tr>\n <th>51975</th>\n <td>WOS:000432981300002</td>\n <td>Chinese Acad Sci</td>\n <td>China</td>\n <td>Chinese Acad Sci</td>\n <td>0</td>\n </tr>\n <tr>\n <th>64618</th>\n <td>WOS:000459693000011</td>\n <td>Babes Bolyai Univ</td>\n <td>Romania</td>\n <td>Babes Bolyai Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>163145</th>\n <td>WOS:000828102100001</td>\n <td>Xidian Univ</td>\n <td>China</td>\n <td>Xidian Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>99690</th>\n <td>WOS:000566510600001</td>\n <td>Fora Forest Technol</td>\n <td>Spain</td>\n <td>Fora Forest Technol</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1567</th>\n <td>WOS:000293492500004</td>\n <td>Univ Essex</td>\n <td>United Kingdom</td>\n <td>Univ Essex</td>\n <td>0</td>\n </tr>\n <tr>\n <th>73076</th>\n <td>WOS:000476471800022</td>\n <td>Shanghai Univ</td>\n <td>China</td>\n <td>Shanghai Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>137096</th>\n <td>WOS:000715426400001</td>\n <td>Queen Mary Hosp</td>\n <td>China</td>\n <td>Queen Mary Hosp</td>\n <td>0</td>\n </tr>\n <tr>\n <th>164978</th>\n <td>WOS:000836819000003</td>\n <td>Manchester Metropolitan Univ</td>\n <td>United Kingdom</td>\n <td>Manchester Metropolitan Univ</td>\n <td>0</td>\n </tr>\n <tr>\n <th>32973</th>\n <td>WOS:000390181300013</td>\n <td>Univ Complutense Madrid</td>\n <td>Spain</td>\n <td>Univ Complutense Madrid</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>500 rows × 5 columns</p>\n</div>"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_harm.sample(500)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [
"univ_harm.to_excel(f\"{outdir}/wos_institution_locations_harmonized.xlsx\", index=False)"

Loading…
Cancel
Save