You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_processing_pipeline.ipynb

1190 lines
96 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"\n",
"def md5hash(s: str):\n",
"    \"\"\"Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8.\"\"\"\n",
"    utf8_bytes = s.encode('utf-8')\n",
"    digest = hashlib.md5(utf8_bytes)\n",
"    return digest.hexdigest()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Column holding the unique Web of Science record identifier; it is the key\n",
"# used for de-duplication and membership tests in the cells below.\n",
"record_col = \"UT (Unique WOS ID)\"\n",
"\n",
"# Tab-separated WoS export that the loading cell reads. Set the\n",
"# WOS_RECORDS_CSV environment variable to point at another copy without\n",
"# editing the notebook; the default is the original machine-specific path.\n",
"outfile = os.environ.get(\n",
"    \"WOS_RECORDS_CSV\",\n",
"    r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " Publication Type Authors \n20209 J Shuang, K; Gu, MY; Li, R; Loo, J; Su, S \\\n9308 J Kuo, CY; Schaarschmidt, A; Cui, YD; Asfour, T;... \n26162 J Jin, BT; Zhou, ZH; Zou, J \n9129 J Sun, QM; Zhang, X; Banerjee, S; Bao, P; Barbry... \n24590 J Jiang, XH; Zhang, WY; Fernie, AR; Wen, WW \n... ... ... \n30650 J Guan, ZT; Zhang, Y; Si, GL; Zhou, ZY; Wu, J; M... \n31337 J Liu, H; Zhang, Y; Yang, T \n4273 J Zhou, YL; Chang, FJ; Chen, H; Li, H \n6761 C Li, Y; Tao, JH; Schuller, B; Shan, SG; Jiang, ... \n41900 J Shang, RH; Kong, JR; Zhang, WT; Feng, J; Jiao,... \n\n Book Authors Book Editors \n20209 NaN NaN \\\n9308 NaN NaN \n26162 NaN NaN \n9129 NaN NaN \n24590 NaN NaN \n... ... ... \n30650 NaN NaN \n31337 NaN NaN \n4273 NaN NaN \n6761 NaN Tan, T; Li, X; Chen, X; Zhou, J; Yang, J; Chen... \n41900 NaN NaN \n\n Book Group Authors Author Full Names \n20209 NaN Shuang, Kai; Gu, Mengyu; Li, Rui; Loo, Jonatha... \\\n9308 NaN Kuo, Cheng-Yu; Schaarschmidt, Andreas; Cui, Yu... \n26162 NaN Jin, Bangti; Zhou, Zehui; Zou, Jun \n9129 NaN Sun, Qiming; Zhang, Xing; Banerjee, Samragni; ... \n24590 NaN Jiang, Xiaohui; Zhang, Weiyi; Fernie, Alisdair... \n... ... ... \n30650 NaN Guan, Zhitao; Zhang, Yue; Si, Guanlin; Zhou, Z... \n31337 NaN Liu, Hong; Zhang, Yan; Yang, Tao \n4273 NaN Zhou, Yanlai; Chang, Fi-John; Chen, Hua; Li, Hong \n6761 NaN Li, Ya; Tao, Jianhua; Schuller, Bjoern; Shan, ... \n41900 NaN Shang, Ronghua; Kong, Jiarui; Zhang, Weitong; ... \n\n Book Author Full Names Group Authors \n20209 NaN NaN \\\n9308 NaN NaN \n26162 NaN NaN \n9129 NaN NaN \n24590 NaN NaN \n... ... ... \n30650 NaN NaN \n31337 NaN NaN \n4273 NaN NaN \n6761 NaN NaN \n41900 NaN NaN \n\n Article Title \n20209 Interactive POS-aware network for aspect-level... \\\n9308 Uncertainty-Aware Contact-Safe Model-Based Rei... \n26162 An analysis of stochastic variance reduced gra... \n9129 Recent developments in the PySCF program package \n24590 Combining novel technologies with interdiscipl... \n... ... 
\n30650 ECOSECURITY: Tackling Challenges Related to Da... \n31337 Blockchain-Enabled Security in Electric Vehicl... \n4273 Exploring Copula-based Bayesian Model Averagin... \n6761 MEC 2016: The Multimodal Emotion Recognition C... \n41900 Uncorrelated feature selection via sparse late... \n\n Source Title ... \n20209 NEUROCOMPUTING ... \\\n9308 IEEE ROBOTICS AND AUTOMATION LETTERS ... \n26162 INVERSE PROBLEMS ... \n9129 JOURNAL OF CHEMICAL PHYSICS ... \n24590 PLANT JOURNAL ... \n... ... ... \n30650 IEEE CONSUMER ELECTRONICS MAGAZINE ... \n31337 IEEE NETWORK ... \n4273 JOURNAL OF CLEANER PRODUCTION ... \n6761 PATTERN RECOGNITION (CCPR 2016), PT II ... \n41900 PATTERN RECOGNITION ... \n\n WoS Categories \n20209 Computer Science, Artificial Intelligence \\\n9308 Robotics \n26162 Mathematics, Applied; Physics, Mathematical \n9129 Chemistry, Physical; Physics, Atomic, Molecula... \n24590 Plant Sciences \n... ... \n30650 Computer Science, Hardware & Architecture; Eng... \n31337 Computer Science, Hardware & Architecture; Com... \n4273 Green & Sustainable Science & Technology; Engi... \n6761 Computer Science, Artificial Intelligence; Com... \n41900 Computer Science, Artificial Intelligence; Eng... \n\n Web of Science Index \n20209 Science Citation Index Expanded (SCI-EXPANDED) \\\n9308 Science Citation Index Expanded (SCI-EXPANDED) \n26162 Science Citation Index Expanded (SCI-EXPANDED) \n9129 Science Citation Index Expanded (SCI-EXPANDED) \n24590 Science Citation Index Expanded (SCI-EXPANDED) \n... ... \n30650 Science Citation Index Expanded (SCI-EXPANDED) \n31337 Science Citation Index Expanded (SCI-EXPANDED) \n4273 Science Citation Index Expanded (SCI-EXPANDED) \n6761 Conference Proceedings Citation Index - Scienc... \n41900 Science Citation Index Expanded (SCI-EXPANDED) \n\n Research Areas IDS Number \n20209 Computer Science PI6QI \\\n9308 Robotics RK6JU \n26162 Mathematics; Physics YB5KT \n9129 Chemistry; Physics MP0IB \n24590 Plant Sciences XW2NZ \n... ... ... 
\n30650 Computer Science; Engineering; Telecommunications HK7MI \n31337 Computer Science; Engineering; Telecommunications GI4BW \n4273 Science & Technology - Other Topics; Engineeri... LT6HI \n6761 Computer Science; Engineering BI1OF \n41900 Computer Science; Engineering 4X1MI \n\n Pubmed Id Open Access Designations \n20209 NaN Green Accepted \\\n9308 NaN Green Submitted \n26162 NaN Green Submitted, Green Accepted \n9129 32668948.0 Green Submitted, Green Published, Green Accepted \n24590 34699639.0 NaN \n... ... ... \n30650 NaN NaN \n31337 NaN NaN \n4273 NaN Green Accepted \n6761 NaN NaN \n41900 NaN NaN \n\n Highly Cited Status Hot Paper Status Date of Export UT (Unique WOS ID) \n20209 NaN NaN 2023-04-28 WOS:000601212800015 \n9308 NaN NaN 2023-04-28 WOS:000638400900003 \n26162 NaN NaN 2023-04-28 WOS:000739051800001 \n9129 Y N 2023-04-28 WOS:000551896400001 \n24590 NaN NaN 2023-04-28 WOS:000716543700001 \n... ... ... ... ... \n30650 NaN NaN 2023-04-28 WOS:000458172200013 \n31337 Y N 2023-04-28 WOS:000434316900012 \n4273 NaN NaN 2023-04-28 WOS:000537169400007 \n6761 NaN NaN 2023-04-28 WOS:000406539900055 \n41900 NaN NaN 2023-04-28 WOS:000860613700012 \n\n[100 rows x 71 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Publication Type</th>\n <th>Authors</th>\n <th>Book Authors</th>\n <th>Book Editors</th>\n <th>Book Group Authors</th>\n <th>Author Full Names</th>\n <th>Book Author Full Names</th>\n <th>Group Authors</th>\n <th>Article Title</th>\n <th>Source Title</th>\n <th>...</th>\n <th>WoS Categories</th>\n <th>Web of Science Index</th>\n <th>Research Areas</th>\n <th>IDS Number</th>\n <th>Pubmed Id</th>\n <th>Open Access Designations</th>\n <th>Highly Cited Status</th>\n <th>Hot Paper Status</th>\n <th>Date of Export</th>\n <th>UT (Unique WOS ID)</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>20209</th>\n <td>J</td>\n <td>Shuang, K; Gu, MY; Li, R; Loo, J; Su, S</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Shuang, Kai; Gu, Mengyu; Li, Rui; Loo, Jonatha...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Interactive POS-aware network for aspect-level...</td>\n <td>NEUROCOMPUTING</td>\n <td>...</td>\n <td>Computer Science, Artificial Intelligence</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Computer Science</td>\n <td>PI6QI</td>\n <td>NaN</td>\n <td>Green Accepted</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000601212800015</td>\n </tr>\n <tr>\n <th>9308</th>\n <td>J</td>\n <td>Kuo, CY; Schaarschmidt, A; Cui, YD; Asfour, T;...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Kuo, Cheng-Yu; Schaarschmidt, Andreas; Cui, Yu...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Uncertainty-Aware Contact-Safe Model-Based Rei...</td>\n <td>IEEE ROBOTICS AND AUTOMATION LETTERS</td>\n <td>...</td>\n <td>Robotics</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Robotics</td>\n <td>RK6JU</td>\n 
<td>NaN</td>\n <td>Green Submitted</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000638400900003</td>\n </tr>\n <tr>\n <th>26162</th>\n <td>J</td>\n <td>Jin, BT; Zhou, ZH; Zou, J</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Jin, Bangti; Zhou, Zehui; Zou, Jun</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>An analysis of stochastic variance reduced gra...</td>\n <td>INVERSE PROBLEMS</td>\n <td>...</td>\n <td>Mathematics, Applied; Physics, Mathematical</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Mathematics; Physics</td>\n <td>YB5KT</td>\n <td>NaN</td>\n <td>Green Submitted, Green Accepted</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000739051800001</td>\n </tr>\n <tr>\n <th>9129</th>\n <td>J</td>\n <td>Sun, QM; Zhang, X; Banerjee, S; Bao, P; Barbry...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Sun, Qiming; Zhang, Xing; Banerjee, Samragni; ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Recent developments in the PySCF program package</td>\n <td>JOURNAL OF CHEMICAL PHYSICS</td>\n <td>...</td>\n <td>Chemistry, Physical; Physics, Atomic, Molecula...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Chemistry; Physics</td>\n <td>MP0IB</td>\n <td>32668948.0</td>\n <td>Green Submitted, Green Published, Green Accepted</td>\n <td>Y</td>\n <td>N</td>\n <td>2023-04-28</td>\n <td>WOS:000551896400001</td>\n </tr>\n <tr>\n <th>24590</th>\n <td>J</td>\n <td>Jiang, XH; Zhang, WY; Fernie, AR; Wen, WW</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Jiang, Xiaohui; Zhang, Weiyi; Fernie, Alisdair...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Combining novel technologies with interdiscipl...</td>\n <td>PLANT JOURNAL</td>\n <td>...</td>\n <td>Plant Sciences</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Plant Sciences</td>\n <td>XW2NZ</td>\n <td>34699639.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000716543700001</td>\n 
</tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>30650</th>\n <td>J</td>\n <td>Guan, ZT; Zhang, Y; Si, GL; Zhou, ZY; Wu, J; M...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Guan, Zhitao; Zhang, Yue; Si, Guanlin; Zhou, Z...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>ECOSECURITY: Tackling Challenges Related to Da...</td>\n <td>IEEE CONSUMER ELECTRONICS MAGAZINE</td>\n <td>...</td>\n <td>Computer Science, Hardware &amp; Architecture; Eng...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Computer Science; Engineering; Telecommunications</td>\n <td>HK7MI</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000458172200013</td>\n </tr>\n <tr>\n <th>31337</th>\n <td>J</td>\n <td>Liu, H; Zhang, Y; Yang, T</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Liu, Hong; Zhang, Yan; Yang, Tao</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Blockchain-Enabled Security in Electric Vehicl...</td>\n <td>IEEE NETWORK</td>\n <td>...</td>\n <td>Computer Science, Hardware &amp; Architecture; Com...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Computer Science; Engineering; Telecommunications</td>\n <td>GI4BW</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Y</td>\n <td>N</td>\n <td>2023-04-28</td>\n <td>WOS:000434316900012</td>\n </tr>\n <tr>\n <th>4273</th>\n <td>J</td>\n <td>Zhou, YL; Chang, FJ; Chen, H; Li, H</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Zhou, Yanlai; Chang, Fi-John; Chen, Hua; Li, Hong</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Exploring Copula-based Bayesian Model Averagin...</td>\n <td>JOURNAL OF CLEANER PRODUCTION</td>\n <td>...</td>\n <td>Green &amp; 
Sustainable Science &amp; Technology; Engi...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Science &amp; Technology - Other Topics; Engineeri...</td>\n <td>LT6HI</td>\n <td>NaN</td>\n <td>Green Accepted</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000537169400007</td>\n </tr>\n <tr>\n <th>6761</th>\n <td>C</td>\n <td>Li, Y; Tao, JH; Schuller, B; Shan, SG; Jiang, ...</td>\n <td>NaN</td>\n <td>Tan, T; Li, X; Chen, X; Zhou, J; Yang, J; Chen...</td>\n <td>NaN</td>\n <td>Li, Ya; Tao, Jianhua; Schuller, Bjoern; Shan, ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>MEC 2016: The Multimodal Emotion Recognition C...</td>\n <td>PATTERN RECOGNITION (CCPR 2016), PT II</td>\n <td>...</td>\n <td>Computer Science, Artificial Intelligence; Com...</td>\n <td>Conference Proceedings Citation Index - Scienc...</td>\n <td>Computer Science; Engineering</td>\n <td>BI1OF</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000406539900055</td>\n </tr>\n <tr>\n <th>41900</th>\n <td>J</td>\n <td>Shang, RH; Kong, JR; Zhang, WT; Feng, J; Jiao,...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Shang, Ronghua; Kong, Jiarui; Zhang, Weitong; ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Uncorrelated feature selection via sparse late...</td>\n <td>PATTERN RECOGNITION</td>\n <td>...</td>\n <td>Computer Science, Artificial Intelligence; Eng...</td>\n <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n <td>Computer Science; Engineering</td>\n <td>4X1MI</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2023-04-28</td>\n <td>WOS:000860613700012</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 71 columns</p>\n</div>"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The export is tab-separated even though the file is named .csv.\n",
"# low_memory=False lets pandas infer each column's dtype from the whole file.\n",
"wos = pd.read_csv(outfile, sep=\"\\t\", low_memory=False)\n",
"# Seeded so the preview shows the same rows on every Restart & Run All.\n",
"wos.sample(100, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of initial (valid interval) records: 56196\n",
"Number of METRIX filtered records: 49854\n",
"Number of unindexed records: 2984\n",
"Number of filtered records (dropping duplicates): 49839\n"
]
}
],
"source": [
"# Keep publication years 2011-2022 (both comparisons are strict).\n",
"wos = wos[((wos[\"Publication Year\"]<2023)&(wos[\"Publication Year\"]>2010))].copy()\n",
"print(f'Number of initial (valid interval) records: {len(wos)}')\n",
"\n",
"# Journal classification table, reshaped below to one row per ISSN value.\n",
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
"\n",
"# stack() moves the labels of the ISSN columns into an unnamed index level,\n",
"# which reset_index() names level_<position>. The position equals the number\n",
"# of id columns, so derive the name instead of hard-coding it: a hard-coded\n",
"# name silently stops matching as soon as the input gains or loses a column.\n",
"metrix_id_cols = [c for c in metrix.columns if \"issn\" not in c]\n",
"metrix = metrix.set_index(metrix_id_cols).stack().reset_index()\n",
"metrix = metrix.rename(columns={f\"level_{len(metrix_id_cols)}\": \"issn_type\", 0: \"issn\"})\n",
"metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"\n",
"# Normalised copies of both ISSN variants. The substring test below is\n",
"# case-sensitive, so the original \"ISSN\" / \"eISSN\" columns stay id columns and\n",
"# only the lower-case copies are stacked into (issn_var, issn) rows.\n",
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos_id_cols = [c for c in wos.columns if \"issn\" not in c]\n",
"# NOTE(review): records with neither ISSN nor eISSN appear to drop out at\n",
"# stack() (the counts printed below do not add up to the initial count) -\n",
"# confirm that losing them here is intended.\n",
"wos = wos.set_index(wos_id_cols).stack().reset_index()\n",
"wos = wos.rename(columns={f\"level_{len(wos_id_cols)}\": \"issn_var\", 0: \"issn\"})\n",
"\n",
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
"\n",
"# A record counts as indexed when at least one of its ISSN rows matched a\n",
"# classified journal; everything else is kept separately for a second attempt.\n",
"wos_indexed = wos_merge[~wos_merge[\"Domain_English\"].isna()]\n",
"wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n",
"\n",
"# Collapse back to one row per record. Sorting issn_var in descending order\n",
"# puts \"issn\" rows before \"eissn\" rows, so the \"issn\" row is the one kept.\n",
"wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
"wos = wos_indexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
"\n",
"wos_postmerge = wos.copy()\n",
"print(f'Number of METRIX filtered records: {len(wos)}')\n",
"print(f'Number of unindexed records: {len(wos_unindexed)}')\n",
"\n",
"# Remove records whose DOI occurs more than once.\n",
"# NOTE(review): duplicated(False) means keep=False, i.e. every record sharing\n",
"# a DOI is removed rather than all but one - confirm that this is intended.\n",
"wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n",
"# Remove remaining duplicates that agree on the bibliographic description.\n",
"wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
"print(f'Number of filtered records (dropping duplicates): {len(wos)}')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "Domain_English\nApplied Sciences 31871\nNatural Sciences 9542\nHealth Sciences 5942\nEconomic & Social Sciences 1468\narticle-level classification 940\nArts & Humanities 76\nName: count, dtype: int64"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[\"Domain_English\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"# Lookup table: for every (WoS Categories, Research Areas) pair found among the\n",
"# classified records, keep the most frequent Domain / Field / SubField. Rows\n",
"# are de-duplicated first, so identical rows are counted only once.\n",
"wos_classifier = (\n",
"    wos[[\"WoS Categories\", \"Research Areas\"] + list(metrix.columns)]\n",
"    .copy()\n",
"    .drop_duplicates()\n",
"    .groupby([\"WoS Categories\", \"Research Areas\"], as_index=False)[\n",
"        [\"Domain_English\", \"Field_English\", \"SubField_English\"]\n",
"    ]\n",
"    .agg(lambda labels: labels.mode()[0])\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found: 2065 \n",
"Lost forever: 919\n"
]
}
],
"source": [
"# Remove every column that came from the journal classification table\n",
"# (including the shared issn key) before attaching a classification again.\n",
"wos_to_reindex = wos_unindexed.drop(columns=metrix.columns.tolist())\n",
"\n",
"# Records whose (WoS Categories, Research Areas) pair exists in the lookup\n",
"# table receive that pair's classification; the inner join drops the rest.\n",
"wos_found = pd.merge(\n",
"    wos_to_reindex,\n",
"    wos_classifier,\n",
"    how=\"inner\",\n",
"    on=[\"WoS Categories\", \"Research Areas\"],\n",
")\n",
"\n",
"# Unindexed records that could not be recovered this way.\n",
"wos_stillost = wos_unindexed.loc[~wos_unindexed[record_col].isin(wos_found[record_col])]\n",
"\n",
"print(\"Found:\", wos_found[record_col].nunique(),\"\\nLost forever:\", wos_stillost[record_col].nunique())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of records (after remerge): 51904\n"
]
}
],
"source": [
"# Append the records recovered through the category lookup. De-duplicating on\n",
"# the record id keeps the cell idempotent: running it a second time no longer\n",
"# appends wos_found again.\n",
"wos = pd.concat([wos, wos_found], ignore_index=True).drop_duplicates(subset=record_col, ignore_index=True)\n",
"print(f'Number of records (after remerge): {len(wos)}')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": "Domain_English\nApplied Sciences 33720\nNatural Sciences 9617\nHealth Sciences 6002\nEconomic & Social Sciences 1533\narticle-level classification 955\nArts & Humanities 77\nName: count, dtype: int64"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[\"Domain_English\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "WoS Categories\nEngineering, Electrical & Electronic 13661\nComputer Science, Artificial Intelligence 7760\nComputer Science, Information Systems 6481\nTelecommunications 5560\nComputer Science, Theory & Methods 3597\n ... \nMusic 1\nCultural Studies 1\nPsychology, Psychoanalysis 1\nAsian Studies 1\nAndrology 1\nName: count, Length: 236, dtype: int64"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per (record, WoS category): split each record's ';'-separated\n",
"# category string, explode the lists and trim the surrounding whitespace.\n",
"wos_cat = (\n",
"    wos.groupby(record_col)[\"WoS Categories\"]\n",
"    .apply(lambda categories: categories.str.split(';'))\n",
"    .explode()\n",
"    .str.strip()\n",
"    .reset_index()\n",
"    .drop(columns=\"level_1\")\n",
")\n",
"wos_cat[\"WoS Categories\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": "WoS Category\nEngineering 20126\nComputer Science 17613\nTelecommunications 5560\nImaging Science & Photographic Technology 3295\nAutomation & Control Systems 3232\n ... \nMusic 1\nAndrology 1\nLiterature 1\nCultural Studies 1\nAsian Studies 1\nName: count, Length: 177, dtype: int64"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split every category at its first comma into a main category and the rest.\n",
"wos_subcat = wos_cat.copy()\n",
"wos_subcat[['WoS Category', 'WoS SubCategory']] = (\n",
"    wos_subcat[\"WoS Categories\"].str.split(\",\", n=1, expand=True)\n",
")\n",
"for text_col in ['WoS Category', 'WoS SubCategory', \"WoS Categories\"]:\n",
"    wos_subcat[text_col] = wos_subcat[text_col].str.strip()\n",
"\n",
"# Count each main category at most once per record.\n",
"(\n",
"    wos_subcat.drop_duplicates(subset=[record_col, 'WoS Category'])[\"WoS Category\"]\n",
"    .value_counts()\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "Research Areas\nEngineering 20176\nComputer Science 17613\nTelecommunications 5560\nEnvironmental Sciences & Ecology 3732\nImaging Science & Photographic Technology 3295\n ... \nLiterature 1\nWomen's Studies 1\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 147, dtype: int64"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per (record, research area): split each record's ';'-separated\n",
"# research-area string, explode the lists and trim the surrounding whitespace.\n",
"wos_areas = (\n",
"    wos.groupby(record_col)[\"Research Areas\"]\n",
"    .apply(lambda areas: areas.str.split(';'))\n",
"    .explode()\n",
"    .str.strip()\n",
"    .reset_index()\n",
"    .drop(columns=\"level_1\")\n",
")\n",
"wos_areas[\"Research Areas\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"data": {
"text/plain": " Article Title \n24862 Kinematic self-calibration of non-contact five... \\\n6623 Optimizing Color Assignment for Perception of ... \n20728 CFD modeling of biomass combustion and gasific... \n41245 Redshift-space distortions in f(R) gravity \n12373 Executable Knowledge Graphs for Machine Learni... \n... ... \n11117 Biochar amendment mitigated N2O emissions from... \n47975 Adaptive Noise Reduction for Sound Event Detec... \n4599 NVM Storage in IoT Devices: Opportunities and ... \n40609 FABNet: Fusion Attention Block and Transfer Le... \n45199 Tea Category Identification Using a Novel Frac... \n\n Keywords Plus \n24862 POSE MEASUREMENT; PARALLEL; MANIPULATOR \\\n6623 OPTIMIZATION; DIFFERENCE \n20728 DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI... \n41245 DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT... \n12373 NaN \n... ... \n11117 NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA... \n47975 NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR... \n4599 ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO... \n40609 NUCLEI \n45199 LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI... \n\n Author Keywords \n24862 kinematic self-calibration; five-axis measurin... \n6623 Color perception; visual design; scatterplots \n20728 Biomass combustion and gasification; CFD simul... \n41245 cosmology: theory; dark energy; large-scale st... \n12373 Knowledge graph; Machine learning; Data analyt... \n... ... \n11117 Biochar; Nitrite accumulation; Nitrous oxide; ... \n47975 sound event detection; non-stationary noise; w... \n4599 IoT; NVM; storage system; energy efficiency; s... \n40609 Cancer; Analytical models; Transfer learning; ... \n45199 tea-category identification; fractional Fourie... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Article Title</th>\n <th>Keywords Plus</th>\n <th>Author Keywords</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>24862</th>\n <td>Kinematic self-calibration of non-contact five...</td>\n <td>POSE MEASUREMENT; PARALLEL; MANIPULATOR</td>\n <td>kinematic self-calibration; five-axis measurin...</td>\n </tr>\n <tr>\n <th>6623</th>\n <td>Optimizing Color Assignment for Perception of ...</td>\n <td>OPTIMIZATION; DIFFERENCE</td>\n <td>Color perception; visual design; scatterplots</td>\n </tr>\n <tr>\n <th>20728</th>\n <td>CFD modeling of biomass combustion and gasific...</td>\n <td>DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...</td>\n <td>Biomass combustion and gasification; CFD simul...</td>\n </tr>\n <tr>\n <th>41245</th>\n <td>Redshift-space distortions in f(R) gravity</td>\n <td>DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...</td>\n <td>cosmology: theory; dark energy; large-scale st...</td>\n </tr>\n <tr>\n <th>12373</th>\n <td>Executable Knowledge Graphs for Machine Learni...</td>\n <td>NaN</td>\n <td>Knowledge graph; Machine learning; Data analyt...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>11117</th>\n <td>Biochar amendment mitigated N2O emissions from...</td>\n <td>NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...</td>\n <td>Biochar; Nitrite accumulation; Nitrous oxide; ...</td>\n </tr>\n <tr>\n <th>47975</th>\n <td>Adaptive Noise Reduction for Sound Event Detec...</td>\n <td>NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...</td>\n <td>sound event detection; non-stationary noise; w...</td>\n </tr>\n <tr>\n <th>4599</th>\n <td>NVM Storage in IoT Devices: Opportunities and 
...</td>\n <td>ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...</td>\n <td>IoT; NVM; storage system; energy efficiency; s...</td>\n </tr>\n <tr>\n <th>40609</th>\n <td>FABNet: Fusion Attention Block and Transfer Le...</td>\n <td>NUCLEI</td>\n <td>Cancer; Analytical models; Transfer learning; ...</td>\n </tr>\n <tr>\n <th>45199</th>\n <td>Tea Category Identification Using a Novel Frac...</td>\n <td>LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...</td>\n <td>tea-category identification; fractional Fourie...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION\n1 WOS:000208837000001 HARDNESS\n2 WOS:000208837000001 PLASMA-SPRAYED COATING\n3 WOS:000208837000001 INVERSE ANALYSIS\n4 WOS:000208837000001 NUMERICAL METHOD\n.. ... ...\n97 WOS:000209571700012 PERSONALIZED MEDICINE\n98 WOS:000209571700012 COMPLEX NETWORK\n99 WOS:000209571700012 CLINICAL PHENOTYPE NETWORK\n100 WOS:000209571700012 TRADITIONAL CHINESE MEDICINE\n101 WOS:000209617200002 PHYLLOSCOPIDAE\n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>NANOINDENTATION</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>HARDNESS</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>PLASMA-SPRAYED COATING</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>INVERSE ANALYSIS</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208837000001</td>\n <td>NUMERICAL METHOD</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209571700012</td>\n <td>PERSONALIZED MEDICINE</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209571700012</td>\n <td>COMPLEX NETWORK</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209571700012</td>\n <td>CLINICAL PHENOTYPE NETWORK</td>\n </tr>\n <tr>\n <th>100</th>\n <td>WOS:000209571700012</td>\n <td>TRADITIONAL CHINESE MEDICINE</td>\n </tr>\n <tr>\n <th>101</th>\n <td>WOS:000209617200002</td>\n <td>PHYLLOSCOPIDAE</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Long table with one row per (record, keyword), drawn from both keyword\n",
"# columns, upper-cased so that the two sources can be de-duplicated together.\n",
"keyword_frames = []\n",
"for kw_col in [\"Keywords Plus\", \"Author Keywords\"]:\n",
"    kwp = (\n",
"        wos.groupby(record_col)[kw_col]\n",
"        .apply(lambda x: x.str.split(';'))\n",
"        .explode()\n",
"        .str.strip()\n",
"        .str.upper()\n",
"        .rename('keyword_all')\n",
"    )\n",
"    keyword_frames.append(kwp.reset_index())\n",
"\n",
"# Concatenate once instead of growing the frame inside the loop; the list is\n",
"# reversed to keep the previous row order (Author Keywords before Keywords Plus).\n",
"kw_df = pd.concat(keyword_frames[::-1], ignore_index=True)\n",
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\")\n",
"\n",
"# Remove bracketed / parenthesised parts. The pattern is a raw string because\n",
"# \\( and \\[ are invalid escape sequences in a normal Python string literal.\n",
"kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(r\"[\\(\\[].*?[\\)\\]]\", \"\", x)).str.strip()\n",
"\n",
"# The removal can leave empty keywords and create new duplicates, so empty\n",
"# values are dropped and de-duplication runs last.\n",
"kw_df = kw_df[kw_df[\"keyword_all\"] != \"\"].drop_duplicates()\n",
"kw_df.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...\n1 WOS:000208863600013 COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n2 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n3 WOS:000208863900217 DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n4 WOS:000208935500007 ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600266</td>\n <td>ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863900217</td>\n <td>DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208935500007</td>\n <td>ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Collapse the long keyword table back to one '; '-joined string per record.\n",
"wos_kwd_concat = (\n",
"    kw_df.groupby(record_col, as_index=False)\n",
"    .agg(keyword_all=('keyword_all', '; '.join))\n",
")\n",
"wos_kwd_concat.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"geotext = GeoText()\n",
"\n",
"def extract_location(input_text, key='countries'):\n",
" anomalies = {\"Malta\":\"Malta\",\n",
" \"Mongolia\":\"Mongolia\",\n",
" \"Quatar\":\"Qatar\",\n",
" \"Qatar\":\"Qatar\",\n",
" \"Ethiop\":\"Ethiopia\",\n",
" \"Nigeria\":\"Nigeria\",\n",
" \"BELAR\":\"Belarus\",\n",
" \"Venezuela\":\"Venezuela\",\n",
" \"Cyprus\":\"Cyprus\",\n",
" \"Ecuador\":\"Ecuador\",\n",
" \"U Arab\":\"United Arab Emirates\",\n",
" \"Syria\":\"Syria\",\n",
" \"Uganda\":\"Uganda\",\n",
" \"Yemen\":\"Yemen\",\n",
" \"Mali\":\"Mali\",\n",
" \"Senegal\":\"Senegal\",\n",
" \"Vatican\":\"Vatican\",\n",
" \"Uruguay\":\"Uruguay\",\n",
" \"Panama\":\"Panama\",\n",
" \"Fiji\":\"Fiji\",\n",
" \"Faroe\":\"Faroe Islands\",\n",
" \"Macedonia\":\"Macedonia\",\n",
" 'Mozambique':'Mozambique',\n",
" \"Kuwait\":\"Kuwait\",\n",
" \"Libya\":\"Libya\",\n",
" \"Turkiy\":\"Turkey\",\n",
" \"Liberia\":\"Liberia\",\n",
" \"Namibia\":\"Namibia\",\n",
" \"Ivoire\":\"Ivory Coast\",\n",
" \"Guatemala\":\"Gutemala\",\n",
" \"Paraguay\":\"Paraguay\",\n",
" \"Honduras\":\"Honduras\",\n",
" \"Nicaragua\":\"Nicaragua\",\n",
" \"Trinidad\":\"Trinidad & Tobago\",\n",
" \"Liechtenstein\":\"Liechtenstein\",\n",
" \"Greenland\":\"Denmark\"}\n",
"\n",
" extracted = geotext.extract(input_text=input_text)\n",
" found = extracted[key].keys()\n",
" if len(sorted(found))>0:\n",
" return sorted(found)[0]\n",
" elif key=='countries':\n",
" for i in ['Scotland','Wales','England', 'N Ireland']:\n",
" if i in input_text:\n",
" return 'United Kingdom'\n",
" for j in anomalies.keys():\n",
" if j in input_text:\n",
" return anomalies.get(j)\n",
" else:\n",
" return None\n",
"\n",
"with open('../eu_members.txt',\"r\") as f:\n",
" eu_countries=f.readline().split(\",\")\n",
" eu_countries=[i.strip() for i in eu_countries]\n",
"\n",
"def country_cleanup(country):\n",
" if \"USA\" in country:\n",
" return \"USA\"\n",
" elif \"China\" in country:\n",
" return \"China\"\n",
" elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n",
" return \"United Kingdom\"\n",
" else:\n",
" return country\n",
"\n",
"\n",
"def country_type(country):\n",
" if country in eu_countries:\n",
" return \"EU\"\n",
" elif country==\"China\":\n",
" return \"China\"\n",
" elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n",
" return \"Non-EU associate\"\n",
" else:\n",
" return \"Other\"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"\n",
"\n",
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": "312820"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(locations)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Authors_of_address \n0 WOS:000208837000001 Gitzhofer, Francois \\\n1 WOS:000208837000001 Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph... \n2 WOS:000208837000001 Guo, Wei-Chao; Zhang, Wei-Hong \n3 WOS:000208837000001 Rauchs, Gast \n4 WOS:000208863600013 Hu, Baolan \n.. ... ... \n95 WOS:000209546000001 Salahuddin, Nawal \n96 WOS:000209546000001 Shrestha, Babu Raja \n97 WOS:000209546000001 Tan, Cheng Cheng \n98 WOS:000209546000001 Tang, Yao-Qing \n99 WOS:000209546000001 Tu, Mei-Lien \n\n Address \n0 Univ Sherbrooke, Dept Chem Engn, Plasma Techno... \n1 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \n2 Northwestern Polytech Univ, Key Lab Contempora... \n3 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n.. ... \n95 Aga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca... \n96 Kathmandu Med Coll Teaching Hosp, Dept Anesthe... \n97 Sultanah Aminah Hosp, Dept Anaesthesia & Inten... \n98 Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,... \n99 Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Authors_of_address</th>\n <th>Address</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>Gitzhofer, Francois</td>\n <td>Univ Sherbrooke, Dept Chem Engn, Plasma Techno...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...</td>\n <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>Guo, Wei-Chao; Zhang, Wei-Hong</td>\n <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>Rauchs, Gast</td>\n <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Hu, Baolan</td>\n <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>95</th>\n <td>WOS:000209546000001</td>\n <td>Salahuddin, Nawal</td>\n <td>Aga Khan Univ &amp; Hosp, Dept Med, Pulm &amp; Crit Ca...</td>\n </tr>\n <tr>\n <th>96</th>\n <td>WOS:000209546000001</td>\n <td>Shrestha, Babu Raja</td>\n <td>Kathmandu Med Coll Teaching Hosp, Dept Anesthe...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209546000001</td>\n <td>Tan, Cheng Cheng</td>\n <td>Sultanah Aminah Hosp, Dept Anaesthesia &amp; Inten...</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209546000001</td>\n <td>Tang, Yao-Qing</td>\n <td>Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209546000001</td>\n <td>Tu, Mei-Lien</td>\n <td>Chang Gung 
Mem Hosp, Kaohsiung Med Ctr, Dept R...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n",
"locations = locations.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_2\")\n",
"locations.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"# import dask.dataframe as dd\n",
"#\n",
"# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame\n",
"# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [],
"source": [
"# locations_test = locations.head(1000)\n",
"# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n",
"# locations_test"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [],
"source": [
"\n",
"# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
"locations[\"Country\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n",
"locations[\"Country\"]=locations['Country'].apply(lambda x: country_cleanup(x))\n",
"locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
"locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [],
"source": [
"scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
"locations=locations[locations[\"Country_Type\"].isin(scope_types)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208837000001 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \\\n2 WOS:000208837000001 Northwestern Polytech Univ, Key Lab Contempora... \n3 WOS:000208837000001 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 WOS:000208863600013 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n5 WOS:000208863600013 Delft Univ Technol, Dept Biotechnol, Delft, Ne... \n\n Country City Country_Type Institution \n1 Belgium Liège EU Univ Liege \n2 China Xian China Northwestern Polytech Univ \n3 Luxembourg Luxembourg EU Ctr Rech Publ Henri Tudor \n4 China Hangzhou China Zhejiang Univ \n5 Netherlands Delft EU Delft Univ Technol ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n <td>Belgium</td>\n <td>Liège</td>\n <td>EU</td>\n <td>Univ Liege</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n <td>China</td>\n <td>Xian</td>\n <td>China</td>\n <td>Northwestern Polytech Univ</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n <td>Luxembourg</td>\n <td>Luxembourg</td>\n <td>EU</td>\n <td>Ctr Rech Publ Henri Tudor</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n <td>China</td>\n <td>Hangzhou</td>\n <td>China</td>\n <td>Zhejiang Univ</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000208863600013</td>\n <td>Delft Univ Technol, Dept Biotechnol, Delft, Ne...</td>\n <td>Netherlands</td>\n <td>Delft</td>\n <td>EU</td>\n <td>Delft Univ Technol</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
"univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
"univ_locations = univ_locations.drop_duplicates()\n",
"univ_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type author_str_id\n0 WOS:000208837000001 Belgium EU 6079964a4094c607358a130e41e89f90\n1 WOS:000208837000001 Belgium EU 2321037fa90ac94a23b88a79f1c7f454\n2 WOS:000208837000001 Belgium EU 8a1bfa1e7bc52d323f0d9c23a9b74ed3\n3 WOS:000208837000001 China China 6079964a4094c607358a130e41e89f90\n4 WOS:000208837000001 China China 17fb036de6a4db3ba39ccab3d8307c04",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>2321037fa90ac94a23b88a79f1c7f454</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>8a1bfa1e7bc52d323f0d9c23a9b74ed3</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
"author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
"author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
"author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n",
"author_locations = author_locations.drop(columns=\"Author_name\")\n",
"author_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208837000001 Belgium EU \\\n3 WOS:000208837000001 China China \n4 WOS:000208837000001 China China \n6 WOS:000208863600013 China China \n7 WOS:000208863600013 Netherlands EU \n... ... ... ... \n643323 WOS:000964683900016 Italy EU \n643324 WOS:000964683900016 Italy EU \n643325 WOS:000967389100001 China China \n643326 WOS:000967389100001 Norway Non-EU associate \n643327 WOS:000967389100001 Norway Non-EU associate \n\n author_str_id \n0 6079964a4094c607358a130e41e89f90 \n3 6079964a4094c607358a130e41e89f90 \n4 17fb036de6a4db3ba39ccab3d8307c04 \n6 54c7bc6fe9b77434ca1bf04d763d843b \n7 df81f9da6c8f5c968c16ef0aab1bb8f9 \n... ... \n643323 3c631398a81ab7058d95a0c6418a2c0b \n643324 3c631398a81ab7058d95a0c6418a2c0b \n643325 ce65541a6c334225a9617439f4a95012 \n643326 7c52a53f8d79b1ffd4f2e4cde9548e1d \n643327 7c52a53f8d79b1ffd4f2e4cde9548e1d \n\n[573569 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208837000001</td>\n <td>Belgium</td>\n <td>EU</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>6079964a4094c607358a130e41e89f90</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208837000001</td>\n <td>China</td>\n <td>China</td>\n <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n </tr>\n <tr>\n <th>6</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>7</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>643323</th>\n <td>WOS:000964683900016</td>\n <td>Italy</td>\n <td>EU</td>\n <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n </tr>\n <tr>\n <th>643324</th>\n <td>WOS:000964683900016</td>\n <td>Italy</td>\n <td>EU</td>\n <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n </tr>\n <tr>\n <th>643325</th>\n <td>WOS:000967389100001</td>\n <td>China</td>\n <td>China</td>\n <td>ce65541a6c334225a9617439f4a95012</td>\n </tr>\n <tr>\n <th>643326</th>\n <td>WOS:000967389100001</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n </tr>\n <tr>\n <th>643327</th>\n <td>WOS:000967389100001</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n </tr>\n </tbody>\n</table>\n<p>573569 rows × 4 
columns</p>\n</div>"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations[author_locations['author_str_id'].duplicated(False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
"# author_primary_region\n",
"\n",
"china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
"eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
"assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n",
"\n",
"\n",
"# records that have distinct authors with different country affiliations\n",
"valid_scope = wos[((wos[record_col].isin(china))\n",
" &\n",
" ((wos[record_col].isin(eu))\n",
" |\n",
" (wos[record_col].isin(assoc))))][record_col].unique()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n537692 WOS:000732204600001 China China \\\n204027 WOS:000414089800001 China China \n204028 WOS:000414089800001 China China \n204029 WOS:000414089800001 China China \n204030 WOS:000414090800001 China China \n\n author_str_id \n537692 8fe31cbbd07c639aa4d779688896be81 \n204027 67c7beb18fafd77f1319739fa683bc5e \n204028 7269f0a31fc620688aae12aad9e3cd85 \n204029 ac28aea698a527fb5195d3d24189ea04 \n204030 6c91bf481b6bddc1426d12a18823224a ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>537692</th>\n <td>WOS:000732204600001</td>\n <td>China</td>\n <td>China</td>\n <td>8fe31cbbd07c639aa4d779688896be81</td>\n </tr>\n <tr>\n <th>204027</th>\n <td>WOS:000414089800001</td>\n <td>China</td>\n <td>China</td>\n <td>67c7beb18fafd77f1319739fa683bc5e</td>\n </tr>\n <tr>\n <th>204028</th>\n <td>WOS:000414089800001</td>\n <td>China</td>\n <td>China</td>\n <td>7269f0a31fc620688aae12aad9e3cd85</td>\n </tr>\n <tr>\n <th>204029</th>\n <td>WOS:000414089800001</td>\n <td>China</td>\n <td>China</td>\n <td>ac28aea698a527fb5195d3d24189ea04</td>\n </tr>\n <tr>\n <th>204030</th>\n <td>WOS:000414090800001</td>\n <td>China</td>\n <td>China</td>\n <td>6c91bf481b6bddc1426d12a18823224a</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_primary_region.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of records: 51904\n",
"Number of valid cooperation records: 46060\n"
]
}
],
"source": [
"print(f'Number of records: {len(wos)}')\n",
"print(f'Number of valid cooperation records: {len(valid_scope)}')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"outputs": [],
"source": [
"wos = wos[wos[record_col].isin(valid_scope)]\n",
"locations = locations[locations[record_col].isin(valid_scope)]\n",
"univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
"author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
"author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
"affiliations = affiliations.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 5616\nUNIVERSITY OF LONDON 2604\nUDICE-FRENCH RESEARCH UNIVERSITIES 2240\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 2170\nTSINGHUA UNIVERSITY 1935\n ... \nUNIVERSITY OF FUKUI 1\nPONTIFICIA UNIVERSIDADE CATOLICA DE GOIAS 1\nINSTITUTE OF ORGANIC CHEMISTRY & BIOCHEMISTRY OF THE CZECH ACADEMY OF SCIENCES 1\nUNIVERSITAS PELITA HARAPAN 1\nFRANCISCUS GASTHUIS 1\nName: count, Length: 7609, dtype: int64"
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 34,
"outputs": [
{
"data": {
"text/plain": "Institution\nChinese Acad Sci 5749\nTsinghua Univ 2315\nShanghai Jiao Tong Univ 1976\nZhejiang Univ 1806\nPeking Univ 1661\n ... \nNatl Technol Inst Mental Disorders 1\nSeinajoki Univ Appl Sci 1\nJD Intelligent City Res 1\nCAS Ctr Excellence Planetol 1\nKey Lab Intelligent Prevent Med Zhejiang Prov 1\nName: count, Length: 19821, dtype: int64"
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"data": {
"text/plain": "46060"
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"data": {
"text/plain": "46060"
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": "202790"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": "268471"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "WoS Categories\n Engineering, Electrical & Electronic 8303\nComputer Science, Artificial Intelligence 6115\n Telecommunications 4661\nComputer Science, Information Systems 4584\nEngineering, Electrical & Electronic 4036\n ... \nCultural Studies 1\n Ornithology 1\n Criminology & Penology 1\nArt 1\n Psychology, Developmental 1\nName: count, Length: 425, dtype: int64"
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Research Areas\nEngineering 18098\nComputer Science 15658\nTelecommunications 5046\nEnvironmental Sciences & Ecology 3246\nImaging Science & Photographic Technology 2947\n ... \nFilm, Radio & Television 2\nArea Studies 2\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 145, dtype: int64"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[c for c in wos.columns if \"_English\" in c]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
"for m in metrix_levels:\n",
" wos[m] = wos[m].replace({\"article-level classification\":\"Multidisciplinary\"})\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " Publication Type Authors \n0 J Yan, Z; Jing, XY; Pedrycz, W \\\n1 J Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ... \n2 J Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue... \n3 J Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ... \n4 J Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu... \n... ... ... \n51897 J Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ... \n51898 J Wang, HC; Roussel, P; Denby, B \n51899 J Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S \n51902 J Chu, WP; Song, Y \n51903 J Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;... \n\n Book Authors Book Editors Book Group Authors \n0 NaN NaN NaN \\\n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n... ... ... ... \n51897 NaN NaN NaN \n51898 NaN NaN NaN \n51899 NaN NaN NaN \n51902 NaN NaN NaN \n51903 NaN NaN NaN \n\n Author Full Names \n0 Yan, Zheng; Jing, Xuyang; Pedrycz, Witold \\\n1 Sookhak, Mehdi; Yu, F. Richard; He, Ying; Tale... \n2 Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G... \n3 Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum... \n4 Lu, Tianguang; Chen, Xinyu; McElroy, Michael B... \n... ... \n51897 Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir... \n51898 Wang, Hongcui; Roussel, Pierre; Denby, Bruce \n51899 Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh... \n51902 Chu, Wenping; Song, Yang \n51903 Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa... \n\n Book Author Full Names Group Authors \n0 NaN NaN \\\n1 NaN NaN \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n Article Title \n0 LEFusing and mining opinions for reputation ge... \\\n1 FOG VEHICULAR COMPUTING Augmentation of Fog Co... \n2 Deep Reinforcement Learning for Intelligent In... \n3 An Intelligent UAV based Data Aggregation Algo... \n4 A Reinforcement Learning-Based Decision System... \n... ... \n51897 Neural modal ordinary differential equations: ... \n51898 Improving ultrasound-based multimodal speech r... 
\n51899 Application of computer-aided image reconstruc... \n51902 Study on Dynamic Interaction of Railway Pantog... \n51903 A Review of Technical Standards for Smart Cities \n\n Source Title ... \n0 INFORMATION FUSION ... \\\n1 IEEE VEHICULAR TECHNOLOGY MAGAZINE ... \n2 IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ... ... \n3 COMPUTER NETWORKS ... \n4 IEEE TRANSACTIONS ON SMART GRID ... \n... ... ... \n51897 DATA-CENTRIC ENGINEERING ... \n51898 JASA EXPRESS LETTERS ... \n51899 EGYPTIAN JOURNAL OF NEUROSURGERY ... \n51902 VIBRATION ... \n51903 CLEAN TECHNOLOGIES ... \n\n UT (Unique WOS ID) issn_var issn Domain_English \n0 WOS:000394070100013 issn 15662535 Applied Sciences \\\n1 WOS:000408568800008 issn 15566072 Applied Sciences \n2 WOS:000502789700018 issn 23327731 Applied Sciences \n3 WOS:000626758800004 issn 13891286 Applied Sciences \n4 WOS:000641976000028 issn 19493053 Applied Sciences \n... ... ... ... ... \n51897 WOS:000906995300001 eissn NaN Applied Sciences \n51898 WOS:000642230800005 eissn NaN Natural Sciences \n51899 WOS:000807222600001 eissn NaN Health Sciences \n51902 WOS:000661660800001 eissn NaN Applied Sciences \n51903 WOS:000708219500008 eissn NaN Natural Sciences \n\n Field_English \n0 Information & Communication Technologies \\\n1 Information & Communication Technologies \n2 Information & Communication Technologies \n3 Information & Communication Technologies \n4 Enabling & Strategic Technologies \n... ... \n51897 Information & Communication Technologies \n51898 Physics & Astronomy \n51899 Clinical Medicine \n51902 Engineering \n51903 Earth & Environmental Sciences \n\n SubField_English 2.00 SEQ \n0 Artificial Intelligence & Image Processing 31 \\\n1 Networking & Telecommunications 37 \n2 Networking & Telecommunications 37 \n3 Networking & Telecommunications 37 \n4 Energy 14 \n... ... ... 
\n51897 Artificial Intelligence & Image Processing NaN \n51898 Acoustics NaN \n51899 Neurology & Neurosurgery NaN \n51902 Mechanical Engineering & Transports NaN \n51903 Environmental Sciences NaN \n\n Source_title srcid \n0 Information Fusion 2.609900e+04 \\\n1 IEEE Vehicular Technology Magazine 5.200153e+09 \n2 IEEE Transactions on Cognitive Communications ... 2.110085e+10 \n3 Computer Networks 2.681100e+04 \n4 IEEE Transactions on Smart Grid 1.970017e+10 \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n issn_type \n0 issn1 \n1 issn1 \n2 issn1 \n3 issn1 \n4 issn2 \n... ... \n51897 NaN \n51898 NaN \n51899 NaN \n51902 NaN \n51903 NaN \n\n[46060 rows x 80 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Publication Type</th>\n <th>Authors</th>\n <th>Book Authors</th>\n <th>Book Editors</th>\n <th>Book Group Authors</th>\n <th>Author Full Names</th>\n <th>Book Author Full Names</th>\n <th>Group Authors</th>\n <th>Article Title</th>\n <th>Source Title</th>\n <th>...</th>\n <th>UT (Unique WOS ID)</th>\n <th>issn_var</th>\n <th>issn</th>\n <th>Domain_English</th>\n <th>Field_English</th>\n <th>SubField_English</th>\n <th>2.00 SEQ</th>\n <th>Source_title</th>\n <th>srcid</th>\n <th>issn_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>J</td>\n <td>Yan, Z; Jing, XY; Pedrycz, W</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yan, Zheng; Jing, Xuyang; Pedrycz, Witold</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>LEFusing and mining opinions for reputation ge...</td>\n <td>INFORMATION FUSION</td>\n <td>...</td>\n <td>WOS:000394070100013</td>\n <td>issn</td>\n <td>15662535</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Artificial Intelligence &amp; Image Processing</td>\n <td>31</td>\n <td>Information Fusion</td>\n <td>2.609900e+04</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>J</td>\n <td>Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Sookhak, Mehdi; Yu, F. 
Richard; He, Ying; Tale...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>FOG VEHICULAR COMPUTING Augmentation of Fog Co...</td>\n <td>IEEE VEHICULAR TECHNOLOGY MAGAZINE</td>\n <td>...</td>\n <td>WOS:000408568800008</td>\n <td>issn</td>\n <td>15566072</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Networking &amp; Telecommunications</td>\n <td>37</td>\n <td>IEEE Vehicular Technology Magazine</td>\n <td>5.200153e+09</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>J</td>\n <td>Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Deep Reinforcement Learning for Intelligent In...</td>\n <td>IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ...</td>\n <td>...</td>\n <td>WOS:000502789700018</td>\n <td>issn</td>\n <td>23327731</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Networking &amp; Telecommunications</td>\n <td>37</td>\n <td>IEEE Transactions on Cognitive Communications ...</td>\n <td>2.110085e+10</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>J</td>\n <td>Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>An Intelligent UAV based Data Aggregation Algo...</td>\n <td>COMPUTER NETWORKS</td>\n <td>...</td>\n <td>WOS:000626758800004</td>\n <td>issn</td>\n <td>13891286</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Networking &amp; Telecommunications</td>\n <td>37</td>\n <td>Computer Networks</td>\n <td>2.681100e+04</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>J</td>\n <td>Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Lu, Tianguang; Chen, Xinyu; McElroy, Michael 
B...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>A Reinforcement Learning-Based Decision System...</td>\n <td>IEEE TRANSACTIONS ON SMART GRID</td>\n <td>...</td>\n <td>WOS:000641976000028</td>\n <td>issn</td>\n <td>19493053</td>\n <td>Applied Sciences</td>\n <td>Enabling &amp; Strategic Technologies</td>\n <td>Energy</td>\n <td>14</td>\n <td>IEEE Transactions on Smart Grid</td>\n <td>1.970017e+10</td>\n <td>issn2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>51897</th>\n <td>J</td>\n <td>Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Neural modal ordinary differential equations: ...</td>\n <td>DATA-CENTRIC ENGINEERING</td>\n <td>...</td>\n <td>WOS:000906995300001</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Artificial Intelligence &amp; Image Processing</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51898</th>\n <td>J</td>\n <td>Wang, HC; Roussel, P; Denby, B</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Wang, Hongcui; Roussel, Pierre; Denby, Bruce</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Improving ultrasound-based multimodal speech r...</td>\n <td>JASA EXPRESS LETTERS</td>\n <td>...</td>\n <td>WOS:000642230800005</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Natural Sciences</td>\n <td>Physics &amp; Astronomy</td>\n <td>Acoustics</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51899</th>\n <td>J</td>\n <td>Zhang, R; Alpdogan, S; Kong, SQ; 
Muhammad, S</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Application of computer-aided image reconstruc...</td>\n <td>EGYPTIAN JOURNAL OF NEUROSURGERY</td>\n <td>...</td>\n <td>WOS:000807222600001</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Health Sciences</td>\n <td>Clinical Medicine</td>\n <td>Neurology &amp; Neurosurgery</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51902</th>\n <td>J</td>\n <td>Chu, WP; Song, Y</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Chu, Wenping; Song, Yang</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Study on Dynamic Interaction of Railway Pantog...</td>\n <td>VIBRATION</td>\n <td>...</td>\n <td>WOS:000661660800001</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Applied Sciences</td>\n <td>Engineering</td>\n <td>Mechanical Engineering &amp; Transports</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>51903</th>\n <td>J</td>\n <td>Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>A Review of Technical Standards for Smart Cities</td>\n <td>CLEAN TECHNOLOGIES</td>\n <td>...</td>\n <td>WOS:000708219500008</td>\n <td>eissn</td>\n <td>NaN</td>\n <td>Natural Sciences</td>\n <td>Earth &amp; Environmental Sciences</td>\n <td>Environmental Sciences</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n<p>46060 rows × 80 columns</p>\n</div>"
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metrix_levels"
]
},
{
"cell_type": "code",
"execution_count": 45,
"outputs": [],
"source": [
"record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n",
"record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n",
"record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n",
"country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 46,
"outputs": [],
"source": [
"# Basic network layout"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 47,
"outputs": [],
"source": [
"country_collabs = record_countries.merge(record_countries, on=record_col)\n",
"country_collabs = country_collabs[country_collabs[\"Country_x\"]!=country_collabs[\"Country_y\"]]\n",
"country_collabs[\"weight\"] = 0.5"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 48,
"outputs": [],
"source": [
"inst_collabs = record_institution.merge(record_institution, on=record_col)\n",
"inst_collabs = inst_collabs[inst_collabs[\"Institution_x\"]!=inst_collabs[\"Institution_y\"]]\n",
"inst_collabs[\"weight\"] = 0.5"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 49,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [
{
"data": {
"text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']"
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drop_cols = [ws for ws in wos.columns if ((\"uthor\" in ws or \"ddress\" in ws or \"ORCID\" in\n",
" ws or \"esearcher\" in ws or \"ditor\" in ws or \"name\" in ws or 'SEQ' in ws) and \"eyword\" not in ws)]\n",
"drop_cols"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [],
"source": [
"outdir=\"wos_processed_data\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [],
"source": [
"os.makedirs(outdir, exist_ok=True)\n",
"\n",
"wos.drop(columns=drop_cols).to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n",
"\n",
"record_countries.to_excel(f\"{outdir}/wos_countries.xlsx\", index=False)\n",
"\n",
"record_author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n",
"\n",
"record_institution.to_excel(f\"{outdir}/wos_institution_locations.xlsx\", index=False)\n",
"\n",
"kw_df.to_excel(f\"{outdir}/wos_keywords.xlsx\", index=False)\n",
"\n",
"country_types.to_excel(f\"{outdir}/wos_country_types.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [],
"source": [
"wos.drop(columns=drop_cols).to_csv(f\"{outdir}/wos_processed.csv\", index=False, sep='\\t')\n",
"\n",
"record_countries.to_csv(f\"{outdir}/wos_countries.csv\", index=False, sep='\\t')\n",
"\n",
"record_author_locations.to_csv(f\"{outdir}/wos_author_locations.csv\", index=False, sep='\\t')\n",
"\n",
"record_institution.to_csv(f\"{outdir}/wos_institution_locations.csv\", index=False, sep='\\t')\n",
"\n",
"kw_df.to_csv(f\"{outdir}/wos_keywords.csv\", index=False, sep='\\t')\n",
"\n",
"country_types.to_csv(f\"{outdir}/wos_country_types.csv\", index=False, sep='\\t')\n",
"\n",
"inst_collabs.to_csv(f\"{outdir}/wos_inst_collabs.csv\", index=False, sep='\\t')\n",
"\n",
"country_collabs.to_csv(f\"{outdir}/wos_country_collabs.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [],
"source": [
"wos_areas.to_csv(f\"{outdir}/wos_research_areas.csv\", index=False, sep='\\t')\n",
"\n",
"wos_subcat.to_csv(f\"{outdir}/wos_categories.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}