You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_processing_pipeline.ipynb

1238 lines
100 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"def md5hash(s: str):\n",
" return hashlib.md5(s.encode('utf-8')).hexdigest()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"record_col=\"UT (Unique WOS ID)\"\n",
"outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " Publication Type Authors \n29758 C Fu, YC; Liu, YH; Gao, ZW \\\n34098 J Han, D; Zhang, CH; Fauconnier, ML \n55478 C Xu, YX; Liu, M; Peng, L; Zhang, JQ; Zheng, YW \n32260 C Liu, Q; Cai, WD; Fu, ZJ; Shen, J; Linge, N \n8751 J Shamshirband, S; Nodoushan, EJ; Adolf, JE; Man... \n... ... ... \n6151 C Seufert, M; Casas, P; Wehner, N; Gang, L; Li, K \n32052 J Huber, A; Kinna, D; Huber, V; Arnoux, G; Balbo... \n27985 J Dong, GP; Ma, J; Kwan, MP; Wang, YM; Chai, YW \n2939 J Yin, ZY; Jin, YF; Huang, HW; Shen, SL \n34651 J Wang, JH; Lindenbergh, R; Menenti, M \n\n Book Authors Book Editors \n29758 NaN Yu, H \\\n34098 NaN NaN \n55478 NaN NaN \n32260 NaN Fang, WC; Vasilakos, T; Stoica, A; Kwak, YS \n8751 NaN NaN \n... ... ... \n6151 NaN Galis, A; Guillemin, F; Noldus, R; Secci, S; I... \n32052 NaN NaN \n27985 NaN NaN \n2939 NaN NaN \n34651 NaN NaN \n\n Book Group Authors Author Full Names \n29758 NaN Fu, Yichuan; Liu, Yuanhong; Gao, Zhiwei \\\n34098 NaN Han, Dong; Zhang, Chun-Hui; Fauconnier, Marie-... \n55478 IEEE Xu, Yuxuan; Liu, Ming; Peng, Linning; Zhang, J... \n32260 NaN Liu, Qi; Cai, Weidong; Fu, Zhangjie; Shen, Jia... \n8751 NaN Shamshirband, Shahaboddin; Nodoushan, Ehsan Ja... \n... ... ... \n6151 NaN Seufert, Michael; Casas, Pedro; Wehner, Nikola... \n32052 NaN Huber, A.; Kinna, D.; Huber, V.; Arnoux, G.; B... \n27985 NaN Dong, Guanpeng; Ma, Jing; Kwan, Mei-Po; Wang, ... \n2939 NaN Yin Zhen-Yu; Jin Yin-Fu; Huang Hong-Wei; Shen ... \n34651 NaN Wang, Jinhu; Lindenbergh, Roderik; Menenti, Ma... \n\n Book Author Full Names Group Authors \n29758 NaN NaN \\\n34098 NaN NaN \n55478 NaN NaN \n32260 NaN NaN \n8751 NaN NaN \n... ... ... \n6151 NaN NaN \n32052 NaN JET Contributors \n27985 NaN NaN \n2939 NaN NaN \n34651 NaN NaN \n\n Article Title \n29758 Multiple Actuator Fault Classification in Wind... \\\n34098 Effect of Seasoning Addition on Volatile Compo... \n55478 Colluding RF Fingerprint Impersonation Attack ... 
\n32260 An Optimized Strategy for Speculative Executio... \n8751 Ensemble models with uncertainty analysis for ... \n... ... \n6151 Stream-based Machine Learning for Real-time Qo... \n32052 The near infrared imaging system for the real-... \n27985 Multi-level temporal autoregressive modelling ... \n2939 Evolutionary polynomial regression based model... \n34651 SigVox - A 3D feature matching algorithm for a... \n\n Source Title ... \n29758 2019 25TH IEEE INTERNATIONAL CONFERENCE ON AUT... ... \\\n34098 FOODS ... \n55478 IEEE INTERNATIONAL CONFERENCE ON COMMUNICATION... ... \n32260 2015 9TH INTERNATIONAL CONFERENCE ON FUTURE GE... ... \n8751 ENGINEERING APPLICATIONS OF COMPUTATIONAL FLUI... ... \n... ... ... \n6151 PROCEEDINGS OF THE 2019 22ND CONFERENCE ON INN... ... \n32052 PHYSICA SCRIPTA ... \n27985 INTERNATIONAL JOURNAL OF GEOGRAPHICAL INFORMAT... ... \n2939 ENGINEERING GEOLOGY ... \n34651 ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SEN... ... \n\n WoS Categories \n29758 Automation & Control Systems; Computer Science... \\\n34098 Food Science & Technology \n55478 Telecommunications \n32260 Computer Science, Hardware & Architecture \n8751 Engineering, Multidisciplinary; Engineering, M... \n... ... \n6151 Computer Science, Hardware & Architecture; Com... \n32052 Physics, Multidisciplinary \n27985 Computer Science, Information Systems; Geograp... \n2939 Engineering, Geological; Geosciences, Multidis... \n34651 Geography, Physical; Geosciences, Multidiscipl... \n\n Web of Science Index \n29758 Conference Proceedings Citation Index - Scienc... \\\n34098 Science Citation Index Expanded (SCI-EXPANDED) \n55478 Conference Proceedings Citation Index - Scienc... \n32260 Conference Proceedings Citation Index - Scienc... \n8751 Science Citation Index Expanded (SCI-EXPANDED) \n... ... \n6151 Conference Proceedings Citation Index - Scienc... \n32052 Science Citation Index Expanded (SCI-EXPANDED)... \n27985 Science Citation Index Expanded (SCI-EXPANDED)... 
\n2939 Science Citation Index Expanded (SCI-EXPANDED) \n34651 Science Citation Index Expanded (SCI-EXPANDED) \n\n Research Areas IDS Number \n29758 Automation & Control Systems; Computer Science BP9AN \\\n34098 Food Science & Technology PV8DT \n55478 Telecommunications BT9VG \n32260 Computer Science BF1GE \n8751 Engineering; Mechanics HE2WU \n... ... ... \n6151 Computer Science BM8PP \n32052 Physics FL3JX \n27985 Computer Science; Geography; Physical Geograph... GS7LK \n2939 Engineering; Geology DS2IG \n34651 Physical Geography; Geology; Remote Sensing; I... EX2BV \n\n Pubmed Id Open Access Designations Highly Cited Status \n29758 NaN NaN NaN \\\n34098 33406625.0 gold, Green Published NaN \n55478 NaN NaN NaN \n32260 NaN NaN NaN \n8751 NaN Green Published, gold Y \n... ... ... ... \n6151 NaN NaN NaN \n32052 NaN NaN NaN \n27985 NaN hybrid, Green Published NaN \n2939 NaN NaN NaN \n34651 NaN NaN NaN \n\n Hot Paper Status Date of Export UT (Unique WOS ID) \n29758 NaN 2023-04-28 WOS:000568623100060 \n34098 NaN 2023-04-28 WOS:000610212800001 \n55478 NaN 2023-04-28 WOS:000864709903078 \n32260 NaN 2023-04-28 WOS:000380393500003 \n8751 N 2023-04-28 WOS:000453212200001 \n... ... ... ... \n6151 NaN 2023-04-28 WOS:000469794500014 \n32052 NaN 2023-04-28 WOS:000414120500027 \n27985 NaN 2023-04-28 WOS:000443882300004 \n2939 NaN 2023-04-28 WOS:000380592100015 \n34651 NaN 2023-04-28 WOS:000403031400010 \n\n[100 rows x 71 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Publication Type</th>\n <th>Authors</th>\n <th>Book Authors</th>\n <th>Book Editors</th>\n <th>Book Group Authors</th>\n <th>Author Full Names</th>\n <th>Book Author Full Names</th>\n <th>Group Authors</th>\n <th>Article Title</th>\n <th>Source Title</th>\n <th>...</th>\n <th>WoS Categories</th>\n <th>Web of Science Index</th>\n <th>Research Areas</th>\n <th>IDS Number</th>\n <th>Pubmed Id</th>\n <th>Open Access Designations</th>\n <th>Highly Cited Status</th>\n <th>Hot Paper Status</th>\n <th>Date of Export</th>\n <th>UT (Unique WOS ID)</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>29758</th>\n <td>C</td>\n <td>Fu, YC; Liu, YH; Gao, ZW</td>\n <td>NaN</td>\n <td>Yu, H</td>\n <td>NaN</td>\n <td>Fu, Yichuan; Liu, Yuanhong; Gao, Zhiwei</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Multiple Actuator Fault Classification in Wind...</td>\n <td>2019 25TH IEEE INTERNATIONAL CONFERENCE ON AUT...</td>\n <td>...</td>\n <td>Automation &amp; Control Systems; Computer Science...</td>\n <td>Conference Proceedings Citation Index - Scienc...</td>\n <td>Automation &amp; Control Systems; Computer Science</td>\n <td>BP9AN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000568623100060</td>\n </tr>\n <tr>\n <th>34098</th>\n <td>J</td>\n <td>Han, D; Zhang, CH; Fauconnier, ML</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Han, Dong; Zhang, Chun-Hui; Fauconnier, Marie-...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Effect of Seasoning Addition on Volatile Compo...</td>\n <td>FOODS</td>\n <td>...</td>\n <td>Food Science &amp; Technology</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Food Science 
&amp; Technology</td>\n <td>PV8DT</td>\n <td>33406625.0</td>\n <td>gold, Green Published</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000610212800001</td>\n </tr>\n <tr>\n <th>55478</th>\n <td>C</td>\n <td>Xu, YX; Liu, M; Peng, L; Zhang, JQ; Zheng, YW</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>IEEE</td>\n <td>Xu, Yuxuan; Liu, Ming; Peng, Linning; Zhang, J...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Colluding RF Fingerprint Impersonation Attack ...</td>\n <td>IEEE INTERNATIONAL CONFERENCE ON COMMUNICATION...</td>\n <td>...</td>\n <td>Telecommunications</td>\n <td>Conference Proceedings Citation Index - Scienc...</td>\n <td>Telecommunications</td>\n <td>BT9VG</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000864709903078</td>\n </tr>\n <tr>\n <th>32260</th>\n <td>C</td>\n <td>Liu, Q; Cai, WD; Fu, ZJ; Shen, J; Linge, N</td>\n <td>NaN</td>\n <td>Fang, WC; Vasilakos, T; Stoica, A; Kwak, YS</td>\n <td>NaN</td>\n <td>Liu, Qi; Cai, Weidong; Fu, Zhangjie; Shen, Jia...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>An Optimized Strategy for Speculative Executio...</td>\n <td>2015 9TH INTERNATIONAL CONFERENCE ON FUTURE GE...</td>\n <td>...</td>\n <td>Computer Science, Hardware &amp; Architecture</td>\n <td>Conference Proceedings Citation Index - Scienc...</td>\n <td>Computer Science</td>\n <td>BF1GE</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000380393500003</td>\n </tr>\n <tr>\n <th>8751</th>\n <td>J</td>\n <td>Shamshirband, S; Nodoushan, EJ; Adolf, JE; Man...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Shamshirband, Shahaboddin; Nodoushan, Ehsan Ja...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Ensemble models with uncertainty analysis for ...</td>\n <td>ENGINEERING APPLICATIONS OF COMPUTATIONAL FLUI...</td>\n <td>...</td>\n <td>Engineering, Multidisciplinary; Engineering, M...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n 
<td>Engineering; Mechanics</td>\n <td>HE2WU</td>\n <td>NaN</td>\n <td>Green Published, gold</td>\n <td>Y</td>\n <td>N</td>\n <td>2023-04-28</td>\n <td>WOS:000453212200001</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>6151</th>\n <td>C</td>\n <td>Seufert, M; Casas, P; Wehner, N; Gang, L; Li, K</td>\n <td>NaN</td>\n <td>Galis, A; Guillemin, F; Noldus, R; Secci, S; I...</td>\n <td>NaN</td>\n <td>Seufert, Michael; Casas, Pedro; Wehner, Nikola...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Stream-based Machine Learning for Real-time Qo...</td>\n <td>PROCEEDINGS OF THE 2019 22ND CONFERENCE ON INN...</td>\n <td>...</td>\n <td>Computer Science, Hardware &amp; Architecture; Com...</td>\n <td>Conference Proceedings Citation Index - Scienc...</td>\n <td>Computer Science</td>\n <td>BM8PP</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000469794500014</td>\n </tr>\n <tr>\n <th>32052</th>\n <td>J</td>\n <td>Huber, A; Kinna, D; Huber, V; Arnoux, G; Balbo...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Huber, A.; Kinna, D.; Huber, V.; Arnoux, G.; B...</td>\n <td>NaN</td>\n <td>JET Contributors</td>\n <td>The near infrared imaging system for the real-...</td>\n <td>PHYSICA SCRIPTA</td>\n <td>...</td>\n <td>Physics, Multidisciplinary</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)...</td>\n <td>Physics</td>\n <td>FL3JX</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000414120500027</td>\n </tr>\n <tr>\n <th>27985</th>\n <td>J</td>\n <td>Dong, GP; Ma, J; Kwan, MP; Wang, YM; Chai, YW</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Dong, 
Guanpeng; Ma, Jing; Kwan, Mei-Po; Wang, ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Multi-level temporal autoregressive modelling ...</td>\n <td>INTERNATIONAL JOURNAL OF GEOGRAPHICAL INFORMAT...</td>\n <td>...</td>\n <td>Computer Science, Information Systems; Geograp...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)...</td>\n <td>Computer Science; Geography; Physical Geograph...</td>\n <td>GS7LK</td>\n <td>NaN</td>\n <td>hybrid, Green Published</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000443882300004</td>\n </tr>\n <tr>\n <th>2939</th>\n <td>J</td>\n <td>Yin, ZY; Jin, YF; Huang, HW; Shen, SL</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yin Zhen-Yu; Jin Yin-Fu; Huang Hong-Wei; Shen ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Evolutionary polynomial regression based model...</td>\n <td>ENGINEERING GEOLOGY</td>\n <td>...</td>\n <td>Engineering, Geological; Geosciences, Multidis...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Engineering; Geology</td>\n <td>DS2IG</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000380592100015</td>\n </tr>\n <tr>\n <th>34651</th>\n <td>J</td>\n <td>Wang, JH; Lindenbergh, R; Menenti, M</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Wang, Jinhu; Lindenbergh, Roderik; Menenti, Ma...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>SigVox - A 3D feature matching algorithm for a...</td>\n <td>ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SEN...</td>\n <td>...</td>\n <td>Geography, Physical; Geosciences, Multidiscipl...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Physical Geography; Geology; Remote Sensing; I...</td>\n <td>EX2BV</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000403031400010</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 71 columns</p>\n</div>"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
"wos.sample(100)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of initial (valid interval) records: 56196\n"
]
},
{
"data": {
"text/plain": " Domain_English Field_English \n0 Applied Sciences Agriculture, Fisheries & Forestry \\\n1 Applied Sciences Agriculture, Fisheries & Forestry \n2 Applied Sciences Agriculture, Fisheries & Forestry \n3 Applied Sciences Agriculture, Fisheries & Forestry \n4 Applied Sciences Agriculture, Fisheries & Forestry \n\n SubField_English 2.00 SEQ Source_title srcid \n0 Agronomy & Agriculture 1 Annals of Biology 13016 \\\n1 Agronomy & Agriculture 1 Advances in Agronomy 14324 \n2 Agronomy & Agriculture 1 European Journal of Soil Biology 14648 \n3 Agronomy & Agriculture 1 Soil Biology and Biochemistry 14802 \n4 Agronomy & Agriculture 1 Agricultura Tecnica 14972 \n\n issn_type issn \n0 issn1 09700153 \n1 issn1 00652113 \n2 issn1 11645563 \n3 issn1 00380717 \n4 issn1 03652807 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Domain_English</th>\n <th>Field_English</th>\n <th>SubField_English</th>\n <th>2.00 SEQ</th>\n <th>Source_title</th>\n <th>srcid</th>\n <th>issn_type</th>\n <th>issn</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Applied Sciences</td>\n <td>Agriculture, Fisheries &amp; Forestry</td>\n <td>Agronomy &amp; Agriculture</td>\n <td>1</td>\n <td>Annals of Biology</td>\n <td>13016</td>\n <td>issn1</td>\n <td>09700153</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Applied Sciences</td>\n <td>Agriculture, Fisheries &amp; Forestry</td>\n <td>Agronomy &amp; Agriculture</td>\n <td>1</td>\n <td>Advances in Agronomy</td>\n <td>14324</td>\n <td>issn1</td>\n <td>00652113</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Applied Sciences</td>\n <td>Agriculture, Fisheries &amp; Forestry</td>\n <td>Agronomy &amp; Agriculture</td>\n <td>1</td>\n <td>European Journal of Soil Biology</td>\n <td>14648</td>\n <td>issn1</td>\n <td>11645563</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Applied Sciences</td>\n <td>Agriculture, Fisheries &amp; Forestry</td>\n <td>Agronomy &amp; Agriculture</td>\n <td>1</td>\n <td>Soil Biology and Biochemistry</td>\n <td>14802</td>\n <td>issn1</td>\n <td>00380717</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Applied Sciences</td>\n <td>Agriculture, Fisheries &amp; Forestry</td>\n <td>Agronomy &amp; Agriculture</td>\n <td>1</td>\n <td>Agricultura Tecnica</td>\n <td>14972</td>\n <td>issn1</td>\n <td>03652807</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"wos = wos[((wos[\"Publication Year\"]<2023)&(wos[\"Publication Year\"]>2010))].copy()\n",
"print(f'Number of initial (valid interval) records: {len(wos)}')\n",
"\n",
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
"\n",
"\n",
"metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n",
"metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n",
"metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"metrix.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "Domain_English 6\nField_English 21\nSubField_English 175\ndtype: int64"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metrix[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of initial (valid interval) records: 56196\n",
"Number of METRIX filtered records: 49854\n",
"Number of unindexed records: 2984\n",
"Number of filtered records (dropping duplicates): 49839\n"
]
}
],
"source": [
"\n",
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n",
"wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n",
"\n",
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
"\n",
"\n",
"\n",
"wos_indexed = wos_merge[~wos_merge[\"Domain_English\"].isna()]\n",
"wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n",
"\n",
"\n",
"wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
"wos = wos_indexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
"\n",
"wos_postmerge = wos.copy()\n",
"print(f'Number of METRIX filtered records: {len(wos)}')\n",
"print(f'Number of unindexed records: {len(wos_unindexed)}')\n",
"\n",
"# drop entries not indexed by metrix\n",
"# drop duplicates (based on doi)\n",
"wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n",
"wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
"print(f'Number of filtered records (dropping duplicates): {len(wos)}')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "Domain_English\nApplied Sciences 31871\nNatural Sciences 9542\nHealth Sciences 5942\nEconomic & Social Sciences 1468\narticle-level classification 940\nArts & Humanities 76\nName: count, dtype: int64"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[\"Domain_English\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"wos_classifier = wos[[\"WoS Categories\",\"Research Areas\"]+list(metrix.columns)].copy().drop_duplicates()\n",
"wos_classifier = wos_classifier.groupby([\"WoS Categories\",\"Research Areas\"], as_index=False)[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].agg(\n",
" lambda x: pd.Series.mode(x)[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found: 2065 \n",
"Lost forever: 919\n"
]
}
],
"source": [
"wos_to_reindex = wos_unindexed.drop(columns=list(metrix.columns))\n",
"wos_found = wos_to_reindex.merge(wos_classifier, on=[\"WoS Categories\",\"Research Areas\"], how=\"inner\")\n",
"# wos_found = wos_to_reindex.merge(wos_classifier, on=\"Research Areas\", how=\"inner\")\n",
"# # wos_found = wos_to_reindex.merge(wos_classifier, on=\"WoS Categories\", how=\"inner\")\n",
"wos_stillost = wos_unindexed[~wos_unindexed[record_col].isin(wos_found[record_col])]\n",
"\n",
"print(\"Found:\", wos_found[record_col].nunique(),\"\\nLost forever:\", wos_stillost[record_col].nunique())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of records (after remerge): 51904\n"
]
}
],
"source": [
"wos = pd.concat([wos,wos_found], ignore_index=True)\n",
"print(f'Number of records (after remerge): {len(wos)}')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": "Domain_English\nApplied Sciences 33720\nNatural Sciences 9617\nHealth Sciences 6002\nEconomic & Social Sciences 1533\narticle-level classification 955\nArts & Humanities 77\nName: count, dtype: int64"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[\"Domain_English\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "WoS Categories\nEngineering, Electrical & Electronic 13661\nComputer Science, Artificial Intelligence 7760\nComputer Science, Information Systems 6481\nTelecommunications 5560\nComputer Science, Theory & Methods 3597\n ... \nMusic 1\nCultural Studies 1\nPsychology, Psychoanalysis 1\nAsian Studies 1\nAndrology 1\nName: count, Length: 236, dtype: int64"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
"wos_cat[\"WoS Categories\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": "WoS Category\nEngineering 20126\nComputer Science 17613\nTelecommunications 5560\nImaging Science & Photographic Technology 3295\nAutomation & Control Systems 3232\n ... \nMusic 1\nAndrology 1\nLiterature 1\nCultural Studies 1\nAsian Studies 1\nName: count, Length: 177, dtype: int64"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_subcat = wos_cat.copy()\n",
"wos_subcat[['WoS Category', 'WoS SubCategory']] = wos_subcat[\"WoS Categories\"].str.split(\",\", expand = True, n=1)\n",
"for c in ['WoS Category', 'WoS SubCategory',\"WoS Categories\"]:\n",
" wos_subcat[c] = wos_subcat[c].str.strip()\n",
"wos_subcat.drop_duplicates(subset=[record_col,'WoS Category'])[\"WoS Category\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "Research Areas\nEngineering 20176\nComputer Science 17613\nTelecommunications 5560\nEnvironmental Sciences & Ecology 3732\nImaging Science & Photographic Technology 3295\n ... \nLiterature 1\nWomen's Studies 1\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 147, dtype: int64"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"data": {
"text/plain": " Article Title \n24862 Kinematic self-calibration of non-contact five... \\\n6623 Optimizing Color Assignment for Perception of ... \n20728 CFD modeling of biomass combustion and gasific... \n41245 Redshift-space distortions in f(R) gravity \n12373 Executable Knowledge Graphs for Machine Learni... \n... ... \n11117 Biochar amendment mitigated N2O emissions from... \n47975 Adaptive Noise Reduction for Sound Event Detec... \n4599 NVM Storage in IoT Devices: Opportunities and ... \n40609 FABNet: Fusion Attention Block and Transfer Le... \n45199 Tea Category Identification Using a Novel Frac... \n\n Keywords Plus \n24862 POSE MEASUREMENT; PARALLEL; MANIPULATOR \\\n6623 OPTIMIZATION; DIFFERENCE \n20728 DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI... \n41245 DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT... \n12373 NaN \n... ... \n11117 NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA... \n47975 NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR... \n4599 ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO... \n40609 NUCLEI \n45199 LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI... \n\n Author Keywords \n24862 kinematic self-calibration; five-axis measurin... \n6623 Color perception; visual design; scatterplots \n20728 Biomass combustion and gasification; CFD simul... \n41245 cosmology: theory; dark energy; large-scale st... \n12373 Knowledge graph; Machine learning; Data analyt... \n... ... \n11117 Biochar; Nitrite accumulation; Nitrous oxide; ... \n47975 sound event detection; non-stationary noise; w... \n4599 IoT; NVM; storage system; energy efficiency; s... \n40609 Cancer; Analytical models; Transfer learning; ... \n45199 tea-category identification; fractional Fourie... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Article Title</th>\n <th>Keywords Plus</th>\n <th>Author Keywords</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>24862</th>\n <td>Kinematic self-calibration of non-contact five...</td>\n <td>POSE MEASUREMENT; PARALLEL; MANIPULATOR</td>\n <td>kinematic self-calibration; five-axis measurin...</td>\n </tr>\n <tr>\n <th>6623</th>\n <td>Optimizing Color Assignment for Perception of ...</td>\n <td>OPTIMIZATION; DIFFERENCE</td>\n <td>Color perception; visual design; scatterplots</td>\n </tr>\n <tr>\n <th>20728</th>\n <td>CFD modeling of biomass combustion and gasific...</td>\n <td>DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...</td>\n <td>Biomass combustion and gasification; CFD simul...</td>\n </tr>\n <tr>\n <th>41245</th>\n <td>Redshift-space distortions in f(R) gravity</td>\n <td>DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...</td>\n <td>cosmology: theory; dark energy; large-scale st...</td>\n </tr>\n <tr>\n <th>12373</th>\n <td>Executable Knowledge Graphs for Machine Learni...</td>\n <td>NaN</td>\n <td>Knowledge graph; Machine learning; Data analyt...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>11117</th>\n <td>Biochar amendment mitigated N2O emissions from...</td>\n <td>NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...</td>\n <td>Biochar; Nitrite accumulation; Nitrous oxide; ...</td>\n </tr>\n <tr>\n <th>47975</th>\n <td>Adaptive Noise Reduction for Sound Event Detec...</td>\n <td>NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...</td>\n <td>sound event detection; non-stationary noise; w...</td>\n </tr>\n <tr>\n <th>4599</th>\n <td>NVM Storage in IoT Devices: Opportunities and 
...</td>\n <td>ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...</td>\n <td>IoT; NVM; storage system; energy efficiency; s...</td>\n </tr>\n <tr>\n <th>40609</th>\n <td>FABNet: Fusion Attention Block and Transfer Le...</td>\n <td>NUCLEI</td>\n <td>Cancer; Analytical models; Transfer learning; ...</td>\n </tr>\n <tr>\n <th>45199</th>\n <td>Tea Category Identification Using a Novel Frac...</td>\n <td>LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...</td>\n <td>tea-category identification; fractional Fourie...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION\n1 WOS:000208837000001 HARDNESS\n2 WOS:000208837000001 PLASMA-SPRAYED COATING\n3 WOS:000208837000001 INVERSE ANALYSIS\n4 WOS:000208837000001 NUMERICAL METHOD\n.. ... ...\n97 WOS:000209571700012 PERSONALIZED MEDICINE\n98 WOS:000209571700012 COMPLEX NETWORK\n99 WOS:000209571700012 CLINICAL PHENOTYPE NETWORK\n100 WOS:000209571700012 TRADITIONAL CHINESE MEDICINE\n101 WOS:000209617200002 PHYLLOSCOPIDAE\n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>NANOINDENTATION</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>HARDNESS</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>PLASMA-SPRAYED COATING</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>INVERSE ANALYSIS</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208837000001</td>\n <td>NUMERICAL METHOD</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209571700012</td>\n <td>PERSONALIZED MEDICINE</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209571700012</td>\n <td>COMPLEX NETWORK</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209571700012</td>\n <td>CLINICAL PHENOTYPE NETWORK</td>\n </tr>\n <tr>\n <th>100</th>\n <td>WOS:000209571700012</td>\n <td>TRADITIONAL CHINESE MEDICINE</td>\n </tr>\n <tr>\n <th>101</th>\n <td>WOS:000209617200002</td>\n <td>PHYLLOSCOPIDAE</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_df = pd.DataFrame()\n",
"for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
" kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
" kwp.name = 'keyword_all'\n",
" kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n",
"kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n",
"kw_df.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...\n1 WOS:000208863600013 COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n2 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n3 WOS:000208863900217 DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n4 WOS:000208935500007 ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600266</td>\n <td>ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863900217</td>\n <td>DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208935500007</td>\n <td>ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
"wos_kwd_concat.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"geotext = GeoText()\n",
"\n",
"def extract_location(input_text, key='countries'):\n",
" anomalies = {\"Malta\":\"Malta\",\n",
" \"Mongolia\":\"Mongolia\",\n",
" \"Quatar\":\"Qatar\",\n",
" \"Qatar\":\"Qatar\",\n",
" \"Ethiop\":\"Ethiopia\",\n",
" \"Nigeria\":\"Nigeria\",\n",
" \"BELAR\":\"Belarus\",\n",
" \"Venezuela\":\"Venezuela\",\n",
" \"Cyprus\":\"Cyprus\",\n",
" \"Ecuador\":\"Ecuador\",\n",
" \"U Arab\":\"United Arab Emirates\",\n",
" \"Syria\":\"Syria\",\n",
" \"Uganda\":\"Uganda\",\n",
" \"Yemen\":\"Yemen\",\n",
" \"Mali\":\"Mali\",\n",
" \"Senegal\":\"Senegal\",\n",
" \"Vatican\":\"Vatican\",\n",
" \"Uruguay\":\"Uruguay\",\n",
" \"Panama\":\"Panama\",\n",
" \"Fiji\":\"Fiji\",\n",
" \"Faroe\":\"Faroe Islands\",\n",
" \"Macedonia\":\"Macedonia\",\n",
" 'Mozambique':'Mozambique',\n",
" \"Kuwait\":\"Kuwait\",\n",
" \"Libya\":\"Libya\",\n",
" \"Turkiy\":\"Turkey\",\n",
" \"Liberia\":\"Liberia\",\n",
" \"Namibia\":\"Namibia\",\n",
" \"Ivoire\":\"Ivory Coast\",\n",
" \"Guatemala\":\"Gutemala\",\n",
" \"Paraguay\":\"Paraguay\",\n",
" \"Honduras\":\"Honduras\",\n",
" \"Nicaragua\":\"Nicaragua\",\n",
" \"Trinidad\":\"Trinidad & Tobago\",\n",
" \"Liechtenstein\":\"Liechtenstein\",\n",
" \"Greenland\":\"Denmark\"}\n",
"\n",
" extracted = geotext.extract(input_text=input_text)\n",
" found = extracted[key].keys()\n",
" if len(sorted(found))>0:\n",
" return sorted(found)[0]\n",
" elif key=='countries':\n",
" for i in ['Scotland','Wales','England', 'N Ireland']:\n",
" if i in input_text:\n",
" return 'United Kingdom'\n",
" for j in anomalies.keys():\n",
" if j in input_text:\n",
" return anomalies.get(j)\n",
" else:\n",
" return None\n",
"\n",
"with open('../eu_members.txt',\"r\") as f:\n",
" eu_countries=f.readline().split(\",\")\n",
" eu_countries=[i.strip() for i in eu_countries]\n",
"\n",
"def country_cleanup(country):\n",
" if \"USA\" in country:\n",
" return \"USA\"\n",
" elif \"China\" in country:\n",
" return \"China\"\n",
" elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n",
" return \"United Kingdom\"\n",
" else:\n",
" return country\n",
"\n",
"\n",
"def country_type(country):\n",
" if country in eu_countries:\n",
" return \"EU\"\n",
" elif country==\"China\":\n",
" return \"China\"\n",
" elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n",
" return \"Non-EU associate\"\n",
" else:\n",
" return \"Other\"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"\n",
"\n",
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": "312820"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(locations)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Authors_of_address \n0 WOS:000208837000001 Gitzhofer, Francois \\\n1 WOS:000208837000001 Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph... \n2 WOS:000208837000001 Guo, Wei-Chao; Zhang, Wei-Hong \n3 WOS:000208837000001 Rauchs, Gast \n4 WOS:000208863600013 Hu, Baolan \n.. ... ... \n95 WOS:000209546000001 Salahuddin, Nawal \n96 WOS:000209546000001 Shrestha, Babu Raja \n97 WOS:000209546000001 Tan, Cheng Cheng \n98 WOS:000209546000001 Tang, Yao-Qing \n99 WOS:000209546000001 Tu, Mei-Lien \n\n Address \n0 Univ Sherbrooke, Dept Chem Engn, Plasma Techno... \n1 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \n2 Northwestern Polytech Univ, Key Lab Contempora... \n3 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n.. ... \n95 Aga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca... \n96 Kathmandu Med Coll Teaching Hosp, Dept Anesthe... \n97 Sultanah Aminah Hosp, Dept Anaesthesia & Inten... \n98 Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,... \n99 Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Authors_of_address</th>\n <th>Address</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>Gitzhofer, Francois</td>\n <td>Univ Sherbrooke, Dept Chem Engn, Plasma Techno...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...</td>\n <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>Guo, Wei-Chao; Zhang, Wei-Hong</td>\n <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>Rauchs, Gast</td>\n <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Hu, Baolan</td>\n <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>95</th>\n <td>WOS:000209546000001</td>\n <td>Salahuddin, Nawal</td>\n <td>Aga Khan Univ &amp; Hosp, Dept Med, Pulm &amp; Crit Ca...</td>\n </tr>\n <tr>\n <th>96</th>\n <td>WOS:000209546000001</td>\n <td>Shrestha, Babu Raja</td>\n <td>Kathmandu Med Coll Teaching Hosp, Dept Anesthe...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209546000001</td>\n <td>Tan, Cheng Cheng</td>\n <td>Sultanah Aminah Hosp, Dept Anaesthesia &amp; Inten...</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209546000001</td>\n <td>Tang, Yao-Qing</td>\n <td>Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209546000001</td>\n <td>Tu, Mei-Lien</td>\n <td>Chang Gung 
Mem Hosp, Kaohsiung Med Ctr, Dept R...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n",
"locations = locations.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_2\")\n",
"locations.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"# import dask.dataframe as dd\n",
"#\n",
"# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame\n",
"# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [],
"source": [
"# locations_test = locations.head(1000)\n",
"# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n",
"# locations_test"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [],
"source": [
"\n",
"# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
"locations[\"Country\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n",
"locations[\"Country\"]=locations['Country'].apply(lambda x: country_cleanup(x))\n",
"locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
"locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [],
"source": [
"scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
"locations=locations[locations[\"Country_Type\"].isin(scope_types)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208837000001 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \\\n2 WOS:000208837000001 Northwestern Polytech Univ, Key Lab Contempora... \n3 WOS:000208837000001 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 WOS:000208863600013 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n5 WOS:000208863600013 Delft Univ Technol, Dept Biotechnol, Delft, Ne... \n\n Country City Country_Type Institution \n1 Belgium Liège EU Univ Liege \n2 China Xian China Northwestern Polytech Univ \n3 Luxembourg Luxembourg EU Ctr Rech Publ Henri Tudor \n4 China Hangzhou China Zhejiang Univ \n5 Netherlands Delft EU Delft Univ Technol ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n <td>Belgium</td>\n <td>Liège</td>\n <td>EU</td>\n <td>Univ Liege</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n <td>China</td>\n <td>Xian</td>\n <td>China</td>\n <td>Northwestern Polytech Univ</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n <td>Luxembourg</td>\n <td>Luxembourg</td>\n <td>EU</td>\n <td>Ctr Rech Publ Henri Tudor</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n <td>China</td>\n <td>Hangzhou</td>\n <td>China</td>\n <td>Zhejiang Univ</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000208863600013</td>\n <td>Delft Univ Technol, Dept Biotechnol, Delft, Ne...</td>\n <td>Netherlands</td>\n <td>Delft</td>\n <td>EU</td>\n <td>Delft Univ Technol</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
"univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
"univ_locations = univ_locations.drop_duplicates()\n",
"univ_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type author_str_id\n0 WOS:000208837000001 Belgium EU 6079964a4094c607358a130e41e89f90\n1 WOS:000208837000001 Belgium EU 2321037fa90ac94a23b88a79f1c7f454\n2 WOS:000208837000001 Belgium EU 8a1bfa1e7bc52d323f0d9c23a9b74ed3\n3 WOS:000208837000001 China China 6079964a4094c607358a130e41e89f90\n4 WOS:000208837000001 China China 17fb036de6a4db3ba39ccab3d8307c04",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>2321037fa90ac94a23b88a79f1c7f454</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>8a1bfa1e7bc52d323f0d9c23a9b74ed3</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
"author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
"author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
"author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n",
"author_locations = author_locations.drop(columns=\"Author_name\")\n",
"author_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208837000001 Belgium EU \\\n3 WOS:000208837000001 China China \n4 WOS:000208837000001 China China \n6 WOS:000208863600013 China China \n7 WOS:000208863600013 Netherlands EU \n... ... ... ... \n643323 WOS:000964683900016 Italy EU \n643324 WOS:000964683900016 Italy EU \n643325 WOS:000967389100001 China China \n643326 WOS:000967389100001 Norway Non-EU associate \n643327 WOS:000967389100001 Norway Non-EU associate \n\n author_str_id \n0 6079964a4094c607358a130e41e89f90 \n3 6079964a4094c607358a130e41e89f90 \n4 17fb036de6a4db3ba39ccab3d8307c04 \n6 54c7bc6fe9b77434ca1bf04d763d843b \n7 df81f9da6c8f5c968c16ef0aab1bb8f9 \n... ... \n643323 3c631398a81ab7058d95a0c6418a2c0b \n643324 3c631398a81ab7058d95a0c6418a2c0b \n643325 ce65541a6c334225a9617439f4a95012 \n643326 7c52a53f8d79b1ffd4f2e4cde9548e1d \n643327 7c52a53f8d79b1ffd4f2e4cde9548e1d \n\n[573569 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n </tr>\n <tr>\n <th>6</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>7</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>643323</th>\n <td>WOS:000964683900016</td>\n <td>Italy</td>\n <td>EU</td>\n <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n </tr>\n <tr>\n <th>643324</th>\n <td>WOS:000964683900016</td>\n <td>Italy</td>\n <td>EU</td>\n <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n </tr>\n <tr>\n <th>643325</th>\n <td>WOS:000967389100001</td>\n <td>China</td>\n <td>China</td>\n <td>ce65541a6c334225a9617439f4a95012</td>\n </tr>\n <tr>\n <th>643326</th>\n <td>WOS:000967389100001</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n </tr>\n <tr>\n <th>643327</th>\n <td>WOS:000967389100001</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n </tr>\n </tbody>\n</table>\n<p>573569 rows × 4 
columns</p>\n</div>"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations[author_locations['author_str_id'].duplicated(False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
"# author_primary_region\n",
"\n",
"china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
"eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
"assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n",
"\n",
"\n",
"# records that have distinct authors with different country affiliations\n",
"valid_scope = wos[((wos[record_col].isin(china))\n",
" &\n",
" ((wos[record_col].isin(eu))\n",
" |\n",
" (wos[record_col].isin(assoc))))][record_col].unique()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n537692 WOS:000732204600001 China China \\\n204027 WOS:000414089800001 China China \n204028 WOS:000414089800001 China China \n204029 WOS:000414089800001 China China \n204030 WOS:000414090800001 China China \n\n author_str_id \n537692 8fe31cbbd07c639aa4d779688896be81 \n204027 67c7beb18fafd77f1319739fa683bc5e \n204028 7269f0a31fc620688aae12aad9e3cd85 \n204029 ac28aea698a527fb5195d3d24189ea04 \n204030 6c91bf481b6bddc1426d12a18823224a ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>537692</th>\n <td>WOS:000732204600001</td>\n <td>China</td>\n <td>China</td>\n <td>8fe31cbbd07c639aa4d779688896be81</td>\n </tr>\n <tr>\n <th>204027</th>\n <td>WOS:000414089800001</td>\n <td>China</td>\n <td>China</td>\n <td>67c7beb18fafd77f1319739fa683bc5e</td>\n </tr>\n <tr>\n <th>204028</th>\n <td>WOS:000414089800001</td>\n <td>China</td>\n <td>China</td>\n <td>7269f0a31fc620688aae12aad9e3cd85</td>\n </tr>\n <tr>\n <th>204029</th>\n <td>WOS:000414089800001</td>\n <td>China</td>\n <td>China</td>\n <td>ac28aea698a527fb5195d3d24189ea04</td>\n </tr>\n <tr>\n <th>204030</th>\n <td>WOS:000414090800001</td>\n <td>China</td>\n <td>China</td>\n <td>6c91bf481b6bddc1426d12a18823224a</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_primary_region.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of records: 51904\n",
"Number of valid cooperation records: 46060\n"
]
}
],
"source": [
"print(f'Number of records: {len(wos)}')\n",
"print(f'Number of valid cooperation records: {len(valid_scope)}')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"outputs": [],
"source": [
"wos = wos[wos[record_col].isin(valid_scope)]\n",
"locations = locations[locations[record_col].isin(valid_scope)]\n",
"univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
"author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
"author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
"affiliations = affiliations.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 5616\nUNIVERSITY OF LONDON 2604\nUDICE-FRENCH RESEARCH UNIVERSITIES 2240\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 2170\nTSINGHUA UNIVERSITY 1935\n ... \nUNIVERSITY OF FUKUI 1\nPONTIFICIA UNIVERSIDADE CATOLICA DE GOIAS 1\nINSTITUTE OF ORGANIC CHEMISTRY & BIOCHEMISTRY OF THE CZECH ACADEMY OF SCIENCES 1\nUNIVERSITAS PELITA HARAPAN 1\nFRANCISCUS GASTHUIS 1\nName: count, Length: 7609, dtype: int64"
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 34,
"outputs": [
{
"data": {
"text/plain": "Institution\nChinese Acad Sci 5749\nTsinghua Univ 2315\nShanghai Jiao Tong Univ 1976\nZhejiang Univ 1806\nPeking Univ 1661\n ... \nNatl Technol Inst Mental Disorders 1\nSeinajoki Univ Appl Sci 1\nJD Intelligent City Res 1\nCAS Ctr Excellence Planetol 1\nKey Lab Intelligent Prevent Med Zhejiang Prov 1\nName: count, Length: 19821, dtype: int64"
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"data": {
"text/plain": "46060"
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"data": {
"text/plain": "46060"
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": "202790"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": "268471"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "WoS Categories\n Engineering, Electrical & Electronic 8303\nComputer Science, Artificial Intelligence 6115\n Telecommunications 4661\nComputer Science, Information Systems 4584\nEngineering, Electrical & Electronic 4036\n ... \nCultural Studies 1\n Ornithology 1\n Criminology & Penology 1\nArt 1\n Psychology, Developmental 1\nName: count, Length: 425, dtype: int64"
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Research Areas\nEngineering 18098\nComputer Science 15658\nTelecommunications 5046\nEnvironmental Sciences & Ecology 3246\nImaging Science & Photographic Technology 2947\n ... \nFilm, Radio & Television 2\nArea Studies 2\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 145, dtype: int64"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[c for c in wos.columns if \"_English\" in c]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
"for m in metrix_levels:\n",
" wos[m] = wos[m].replace({\"article-level classification\":\"Multidisciplinary\"})\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " Publication Type Authors \n0 J Yan, Z; Jing, XY; Pedrycz, W \\\n1 J Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ... \n2 J Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue... \n3 J Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ... \n4 J Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu... \n... ... ... \n51897 J Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ... \n51898 J Wang, HC; Roussel, P; Denby, B \n51899 J Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S \n51902 J Chu, WP; Song, Y \n51903 J Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;... \n\n Book Authors Book Editors Book Group Authors \n0 NaN NaN NaN \\\n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n... ... ... ... \n51897 NaN NaN NaN \n51898 NaN NaN NaN \n51899 NaN NaN NaN \n51902 NaN NaN NaN \n51903 NaN NaN NaN \n\n Author Full Names \n0 Yan, Zheng; Jing, Xuyang; Pedrycz, Witold \\\n1 Sookhak, Mehdi; Yu, F. Richard; He, Ying; Tale... \n2 Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G... \n3 Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum... \n4 Lu, Tianguang; Chen, Xinyu; McElroy, Michael B... \n... ... \n51897 Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir... \n51898 Wang, Hongcui; Roussel, Pierre; Denby, Bruce \n51899 Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh... \n51902 Chu, Wenping; Song, Yang \n51903 Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa... \n\n Book Author Full Names Group Authors \n0 NaN NaN \\\n1 NaN NaN \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n Article Title \n0 LEFusing and mining opinions for reputation ge... \\\n1 FOG VEHICULAR COMPUTING Augmentation of Fog Co... \n2 Deep Reinforcement Learning for Intelligent In... \n3 An Intelligent UAV based Data Aggregation Algo... \n4 A Reinforcement Learning-Based Decision System... \n... ... \n51897 Neural modal ordinary differential equations: ... \n51898 Improving ultrasound-based multimodal speech r... 
\n51899 Application of computer-aided image reconstruc... \n51902 Study on Dynamic Interaction of Railway Pantog... \n51903 A Review of Technical Standards for Smart Cities \n\n Source Title ... \n0 INFORMATION FUSION ... \\\n1 IEEE VEHICULAR TECHNOLOGY MAGAZINE ... \n2 IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ... ... \n3 COMPUTER NETWORKS ... \n4 IEEE TRANSACTIONS ON SMART GRID ... \n... ... ... \n51897 DATA-CENTRIC ENGINEERING ... \n51898 JASA EXPRESS LETTERS ... \n51899 EGYPTIAN JOURNAL OF NEUROSURGERY ... \n51902 VIBRATION ... \n51903 CLEAN TECHNOLOGIES ... \n\n UT (Unique WOS ID) issn_var issn Domain_English \n0 WOS:000394070100013 issn 15662535 Applied Sciences \\\n1 WOS:000408568800008 issn 15566072 Applied Sciences \n2 WOS:000502789700018 issn 23327731 Applied Sciences \n3 WOS:000626758800004 issn 13891286 Applied Sciences \n4 WOS:000641976000028 issn 19493053 Applied Sciences \n... ... ... ... ... \n51897 WOS:000906995300001 eissn NaN Applied Sciences \n51898 WOS:000642230800005 eissn NaN Natural Sciences \n51899 WOS:000807222600001 eissn NaN Health Sciences \n51902 WOS:000661660800001 eissn NaN Applied Sciences \n51903 WOS:000708219500008 eissn NaN Natural Sciences \n\n Field_English \n0 Information & Communication Technologies \\\n1 Information & Communication Technologies \n2 Information & Communication Technologies \n3 Information & Communication Technologies \n4 Enabling & Strategic Technologies \n... ... \n51897 Information & Communication Technologies \n51898 Physics & Astronomy \n51899 Clinical Medicine \n51902 Engineering \n51903 Earth & Environmental Sciences \n\n SubField_English 2.00 SEQ \n0 Artificial Intelligence & Image Processing 31 \\\n1 Networking & Telecommunications 37 \n2 Networking & Telecommunications 37 \n3 Networking & Telecommunications 37 \n4 Energy 14 \n... ... ... 
\n51897 Artificial Intelligence & Image Processing NaN \n51898 Acoustics NaN \n51899 Neurology & Neurosurgery NaN \n51902 Mechanical Engineering & Transports NaN \n51903 Environmental Sciences NaN \n\n Source_title srcid \n0 Information Fusion 2.609900e+04 \\\n1 IEEE Vehicular Technology Magazine 5.200153e+09 \n2 IEEE Transactions on Cognitive Communications ... 2.110085e+10 \n3 Computer Networks 2.681100e+04 \n4 IEEE Transactions on Smart Grid 1.970017e+10 \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n issn_type \n0 issn1 \n1 issn1 \n2 issn1 \n3 issn1 \n4 issn2 \n... ... \n51897 NaN \n51898 NaN \n51899 NaN \n51902 NaN \n51903 NaN \n\n[46060 rows x 80 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Publication Type</th>\n <th>Authors</th>\n <th>Book Authors</th>\n <th>Book Editors</th>\n <th>Book Group Authors</th>\n <th>Author Full Names</th>\n <th>Book Author Full Names</th>\n <th>Group Authors</th>\n <th>Article Title</th>\n <th>Source Title</th>\n <th>...</th>\n <th>UT (Unique WOS ID)</th>\n <th>issn_var</th>\n <th>issn</th>\n <th>Domain_English</th>\n <th>Field_English</th>\n <th>SubField_English</th>\n <th>2.00 SEQ</th>\n <th>Source_title</th>\n <th>srcid</th>\n <th>issn_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>J</td>\n <td>Yan, Z; Jing, XY; Pedrycz, W</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yan, Zheng; Jing, Xuyang; Pedrycz, Witold</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>LEFusing and mining opinions for reputation ge...</td>\n <td>INFORMATION FUSION</td>\n <td>...</td>\n <td>WOS:000394070100013</td>\n <td>issn</td>\n <td>15662535</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Artificial Intelligence &amp; Image Processing</td>\n <td>31</td>\n <td>Information Fusion</td>\n <td>2.609900e+04</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>J</td>\n <td>Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Sookhak, Mehdi; Yu, F. 
Richard; He, Ying; Tale...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>FOG VEHICULAR COMPUTING Augmentation of Fog Co...</td>\n <td>IEEE VEHICULAR TECHNOLOGY MAGAZINE</td>\n <td>...</td>\n <td>WOS:000408568800008</td>\n <td>issn</td>\n <td>15566072</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Networking &amp; Telecommunications</td>\n <td>37</td>\n <td>IEEE Vehicular Technology Magazine</td>\n <td>5.200153e+09</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>J</td>\n <td>Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Deep Reinforcement Learning for Intelligent In...</td>\n <td>IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ...</td>\n <td>...</td>\n <td>WOS:000502789700018</td>\n <td>issn</td>\n <td>23327731</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Networking &amp; Telecommunications</td>\n <td>37</td>\n <td>IEEE Transactions on Cognitive Communications ...</td>\n <td>2.110085e+10</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>J</td>\n <td>Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>An Intelligent UAV based Data Aggregation Algo...</td>\n <td>COMPUTER NETWORKS</td>\n <td>...</td>\n <td>WOS:000626758800004</td>\n <td>issn</td>\n <td>13891286</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Networking &amp; Telecommunications</td>\n <td>37</td>\n <td>Computer Networks</td>\n <td>2.681100e+04</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>J</td>\n <td>Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Lu, Tianguang; Chen, Xinyu; McElroy, Michael 
B...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>A Reinforcement Learning-Based Decision System...</td>\n <td>IEEE TRANSACTIONS ON SMART GRID</td>\n <td>...</td>\n <td>WOS:000641976000028</td>\n <td>issn</td>\n <td>19493053</td>\n <td>Applied Sciences</td>\n <td>Enabling &amp; Strategic Technologies</td>\n <td>Energy</td>\n <td>14</td>\n <td>IEEE Transactions on Smart Grid</td>\n <td>1.970017e+10</td>\n <td>issn2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>51897</th>\n <td>J</td>\n <td>Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Neural modal ordinary differential equations: ...</td>\n <td>DATA-CENTRIC ENGINEERING</td>\n <td>...</td>\n <td>WOS:000906995300001</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Artificial Intelligence &amp; Image Processing</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51898</th>\n <td>J</td>\n <td>Wang, HC; Roussel, P; Denby, B</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Wang, Hongcui; Roussel, Pierre; Denby, Bruce</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Improving ultrasound-based multimodal speech r...</td>\n <td>JASA EXPRESS LETTERS</td>\n <td>...</td>\n <td>WOS:000642230800005</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Natural Sciences</td>\n <td>Physics &amp; Astronomy</td>\n <td>Acoustics</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51899</th>\n <td>J</td>\n <td>Zhang, R; Alpdogan, S; Kong, SQ; 
Muhammad, S</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Application of computer-aided image reconstruc...</td>\n <td>EGYPTIAN JOURNAL OF NEUROSURGERY</td>\n <td>...</td>\n <td>WOS:000807222600001</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Health Sciences</td>\n <td>Clinical Medicine</td>\n <td>Neurology &amp; Neurosurgery</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51902</th>\n <td>J</td>\n <td>Chu, WP; Song, Y</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Chu, Wenping; Song, Yang</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Study on Dynamic Interaction of Railway Pantog...</td>\n <td>VIBRATION</td>\n <td>...</td>\n <td>WOS:000661660800001</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Applied Sciences</td>\n <td>Engineering</td>\n <td>Mechanical Engineering &amp; Transports</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51903</th>\n <td>J</td>\n <td>Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>A Review of Technical Standards for Smart Cities</td>\n <td>CLEAN TECHNOLOGIES</td>\n <td>...</td>\n <td>WOS:000708219500008</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Natural Sciences</td>\n <td>Earth &amp; Environmental Sciences</td>\n <td>Environmental Sciences</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n<p>46060 rows × 80 columns</p>\n</div>"
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metrix_levels"
]
},
{
"cell_type": "code",
"execution_count": 45,
"outputs": [],
"source": [
"# Deduplicated lookup tables: each one keeps only the listed columns of its\n",
"# source frame and drops rows that repeat exactly.\n",
"record_countries = (\n",
"    locations\n",
"    .loc[:, [record_col, \"Country\"]]\n",
"    .drop_duplicates()\n",
")\n",
"record_author_locations = (\n",
"    author_locations\n",
"    .loc[:, [record_col, \"author_str_id\", \"Country\"]]\n",
"    .drop_duplicates()\n",
")\n",
"record_institution = (\n",
"    univ_locations\n",
"    .loc[:, [record_col, \"Institution\", \"Country\"]]\n",
"    .drop_duplicates()\n",
")\n",
"country_types = (\n",
"    locations\n",
"    .loc[:, [\"Country\", \"Country_Type\"]]\n",
"    .drop_duplicates()\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 46,
"outputs": [],
"source": [
"# Basic network layout: build pairwise collaboration tables for countries and for institutions"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 47,
"outputs": [],
"source": [
"# Self-merge on the record id pairs every country of a record with every other\n",
"# country of the same record. Rows pairing a country with itself are removed;\n",
"# both orderings of a pair remain, each carrying a constant weight of 0.5.\n",
"country_collabs = (\n",
"    record_countries\n",
"    .merge(record_countries, on=record_col)\n",
"    .loc[lambda pairs: pairs[\"Country_x\"] != pairs[\"Country_y\"]]\n",
"    .assign(weight=0.5)\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 48,
"outputs": [],
"source": [
"# Same construction as the country table, at institution level: self-merge on the\n",
"# record id, drop rows pairing an institution with itself, set weight to 0.5.\n",
"inst_collabs = (\n",
"    record_institution\n",
"    .merge(record_institution, on=record_col)\n",
"    .loc[lambda pairs: pairs[\"Institution_x\"] != pairs[\"Institution_y\"]]\n",
"    .assign(weight=0.5)\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 49,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [
{
"data": {
"text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']"
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A column is selected for dropping when its name contains any of these fragments\n",
"# (case-sensitive substring match), unless the name also contains \"eyword\".\n",
"drop_fragments = (\"uthor\", \"ddress\", \"ORCID\", \"esearcher\", \"ditor\", \"name\", \"SEQ\")\n",
"drop_cols = [\n",
"    col\n",
"    for col in wos.columns\n",
"    if \"eyword\" not in col and any(frag in col for frag in drop_fragments)\n",
"]\n",
"drop_cols"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [],
"source": [
"# Relative folder name used as the prefix of every export path below.\n",
"outdir = \"wos_processed_data\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [],
"source": [
"os.makedirs(outdir, exist_ok=True)\n",
"\n",
"# (table, target path) pairs; each table goes to its own workbook, without the index.\n",
"excel_exports = [\n",
"    (wos.drop(columns=drop_cols), f\"{outdir}/wos_processed.xlsx\"),\n",
"    (record_countries, f\"{outdir}/wos_countries.xlsx\"),\n",
"    (record_author_locations, f\"{outdir}/wos_author_locations.xlsx\"),\n",
"    (record_institution, f\"{outdir}/wos_institution_locations.xlsx\"),\n",
"    (kw_df, f\"{outdir}/wos_keywords.xlsx\"),\n",
"    (country_types, f\"{outdir}/wos_country_types.xlsx\"),\n",
"]\n",
"for table, path in excel_exports:\n",
"    table.to_excel(path, index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [],
"source": [
"# (table, target path) pairs; the files carry a .csv extension but are written\n",
"# tab-separated, without the index column.\n",
"csv_exports = [\n",
"    (wos.drop(columns=drop_cols), f\"{outdir}/wos_processed.csv\"),\n",
"    (record_countries, f\"{outdir}/wos_countries.csv\"),\n",
"    (record_author_locations, f\"{outdir}/wos_author_locations.csv\"),\n",
"    (record_institution, f\"{outdir}/wos_institution_locations.csv\"),\n",
"    (kw_df, f\"{outdir}/wos_keywords.csv\"),\n",
"    (country_types, f\"{outdir}/wos_country_types.csv\"),\n",
"    (inst_collabs, f\"{outdir}/wos_inst_collabs.csv\"),\n",
"    (country_collabs, f\"{outdir}/wos_country_collabs.csv\"),\n",
"]\n",
"for table, path in csv_exports:\n",
"    table.to_csv(path, index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [],
"source": [
"# Write the research-area and category tables as tab-separated files, without the index.\n",
"category_exports = [\n",
"    (wos_areas, f\"{outdir}/wos_research_areas.csv\"),\n",
"    (wos_subcat, f\"{outdir}/wos_categories.csv\"),\n",
"]\n",
"for table, path in category_exports:\n",
"    table.to_csv(path, index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}