You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_processing_pipeline.ipynb

1448 lines
686 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"def md5hash(s: str):\n",
" return hashlib.md5(s.encode('utf-8')).hexdigest()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"record_col=\"UT (Unique WOS ID)\"\n",
"outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of initial records: 41511\n",
"Number of filtered records: 35663\n"
]
}
],
"source": [
"wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
"print(f'Number of initial records: {len(wos)}')\n",
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
"\n",
"\n",
"metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n",
"metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n",
"metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"\n",
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n",
"wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n",
"\n",
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
"wos = wos_merge.sort_values(by=\"issn_var\",ascending=False).drop_duplicates(subset=record_col)\n",
"\n",
"# drop entries not indexed by metrix\n",
"wos = wos[~wos[\"Domain_English\"].isna()]\n",
"# drop duplicates (based on doi)\n",
"wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n",
"wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
"wos = wos[((wos[\"Publication Year\"]<2023) & (~wos['Domain_English'].isna()))]\n",
"print(f'Number of filtered records: {len(wos)}')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "WoS Categories\nEngineering, Electrical & Electronic 9344\nComputer Science, Artificial Intelligence 6045\nComputer Science, Information Systems 5162\nTelecommunications 3929\nComputer Science, Theory & Methods 2706\n ... \nLiterature 1\nEducation, Special 1\nDemography 1\nSocial Work 1\nWomen's Studies 1\nName: count, Length: 234, dtype: int64"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
"wos_cat[\"WoS Categories\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": "WoS Category\nEngineering 14168\nComputer Science 13807\nTelecommunications 3929\nImaging Science & Photographic Technology 2155\nAutomation & Control Systems 1964\n ... \nLiterature 1\nDemography 1\nWomen's Studies 1\nSocial Work 1\nMusic 1\nName: count, Length: 176, dtype: int64"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_subcat = wos_cat.copy()\n",
"wos_subcat[['WoS Category', 'WoS SubCategory']] = wos_subcat[\"WoS Categories\"].str.split(\",\", expand = True, n=1)\n",
"for c in ['WoS Category', 'WoS SubCategory',\"WoS Categories\"]:\n",
" wos_subcat[c] = wos_subcat[c].str.strip()\n",
"wos_subcat.drop_duplicates(subset=[record_col,'WoS Category'])[\"WoS Category\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "Research Areas\nEngineering 14204\nComputer Science 13807\nTelecommunications 3929\nEnvironmental Sciences & Ecology 2156\nImaging Science & Photographic Technology 2155\n ... \nCultural Studies 1\nAsian Studies 1\nMusic 1\nDemography 1\nSocial Work 1\nName: count, Length: 147, dtype: int64"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 101,
"outputs": [
{
"data": {
"text/plain": " Article Title \n28929 Superpixel Nonlocal Weighting Joint Sparse Rep... \\\n8360 Graph topology enhancement for text classifica... \n42582 Application of machine learning and rough set ... \n32203 BUILDING ROBUST SPOKEN LANGUAGE UNDERSTANDING ... \n37519 Mining of High-Utility Patterns in Big IoT-bas... \n... ... \n21487 Long-range precipitation forecast based on mul... \n37473 Big data fusion in Internet of Things \n27468 An Effective Approach for Selection of Terrain... \n8955 BlockHammer: Improving Flash Reliability by Ex... \n67744 Deeply Supervised Salient Object Detection wit... \n\n Keywords Plus \n28929 DIMENSIONALITY REDUCTION; FEATURE-EXTRACTION; ... \\\n8360 NaN \n42582 PREDICTIVE MAINTENANCE; FRAMEWORK; SELECTION; ... \n32203 NETWORKS \n37519 FREQUENT ITEMSETS; UNCERTAIN; DISCOVERY \n... ... \n21487 YANGTZE-RIVER BASIN; INTERDECADAL VARIABILITY;... \n37473 NaN \n27468 ERROR ANALYSIS \n8955 MEMORY; PERFORMANCE; RETENTION; ENDURANCE; OPT... \n67744 IMAGE; ATTENTION; MODEL \n\n Author Keywords \n28929 spatial-spectral fusion; joint sparse represen... \n8360 Text classification; Graph neural networks; To... \n42582 maintenance; availability; machine learning; d... \n32203 Spoken Language Understanding; NLU Robustness;... \n37519 IoT data analytics; Utility patterns; Data min... \n... ... \n21487 long range; multipole SSTA; preceding fluctuat... \n37473 NaN \n27468 Support vector machine (SVM); terrain classifi... \n8955 Reliability; Three-dimensional displays; Error... \n67744 Salient object detection; short connection; de... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Article Title</th>\n <th>Keywords Plus</th>\n <th>Author Keywords</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>28929</th>\n <td>Superpixel Nonlocal Weighting Joint Sparse Rep...</td>\n <td>DIMENSIONALITY REDUCTION; FEATURE-EXTRACTION; ...</td>\n <td>spatial-spectral fusion; joint sparse represen...</td>\n </tr>\n <tr>\n <th>8360</th>\n <td>Graph topology enhancement for text classifica...</td>\n <td>NaN</td>\n <td>Text classification; Graph neural networks; To...</td>\n </tr>\n <tr>\n <th>42582</th>\n <td>Application of machine learning and rough set ...</td>\n <td>PREDICTIVE MAINTENANCE; FRAMEWORK; SELECTION; ...</td>\n <td>maintenance; availability; machine learning; d...</td>\n </tr>\n <tr>\n <th>32203</th>\n <td>BUILDING ROBUST SPOKEN LANGUAGE UNDERSTANDING ...</td>\n <td>NETWORKS</td>\n <td>Spoken Language Understanding; NLU Robustness;...</td>\n </tr>\n <tr>\n <th>37519</th>\n <td>Mining of High-Utility Patterns in Big IoT-bas...</td>\n <td>FREQUENT ITEMSETS; UNCERTAIN; DISCOVERY</td>\n <td>IoT data analytics; Utility patterns; Data min...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>21487</th>\n <td>Long-range precipitation forecast based on mul...</td>\n <td>YANGTZE-RIVER BASIN; INTERDECADAL VARIABILITY;...</td>\n <td>long range; multipole SSTA; preceding fluctuat...</td>\n </tr>\n <tr>\n <th>37473</th>\n <td>Big data fusion in Internet of Things</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>27468</th>\n <td>An Effective Approach for Selection of Terrain...</td>\n <td>ERROR ANALYSIS</td>\n <td>Support vector machine (SVM); terrain classifi...</td>\n </tr>\n <tr>\n <th>8955</th>\n <td>BlockHammer: Improving Flash Reliability by Ex...</td>\n <td>MEMORY; PERFORMANCE; RETENTION; ENDURANCE; OPT...</td>\n <td>Reliability; Three-dimensional displays; Error...</td>\n </tr>\n <tr>\n <th>67744</th>\n <td>Deeply Supervised Salient Object Detection wit...</td>\n <td>IMAGE; ATTENTION; MODEL</td>\n <td>Salient object detection; short connection; de...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 102,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600013 COMPARATIVE GENOMICS\n1 WOS:000208863600013 ANAMMOX\n2 WOS:000208863600013 KUENENIA STUTTGARTIENSIS\n3 WOS:000208863600013 METAGENOMICS\n4 WOS:000208863600013 ENRICHMENT CULTURE\n.. ... ...\n97 WOS:000209724300006 VIRTUAL DISKS\n98 WOS:000209724300006 HETEROGENEOUS SERVICES\n99 WOS:000209810700046 CORROSION CHARACTERIZATION\n100 WOS:000209810700046 FEATURE EXTRACTION\n101 WOS:000209810700046 PULSED EDDY CURRENT\n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>ANAMMOX</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>KUENENIA STUTTGARTIENSIS</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>METAGENOMICS</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>ENRICHMENT CULTURE</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209724300006</td>\n <td>VIRTUAL DISKS</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209724300006</td>\n <td>HETEROGENEOUS SERVICES</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209810700046</td>\n <td>CORROSION CHARACTERIZATION</td>\n </tr>\n <tr>\n <th>100</th>\n <td>WOS:000209810700046</td>\n <td>FEATURE EXTRACTION</td>\n </tr>\n <tr>\n <th>101</th>\n <td>WOS:000209810700046</td>\n <td>PULSED EDDY CURRENT</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_df = pd.DataFrame()\n",
"for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
" kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
" kwp.name = 'keyword_all'\n",
" kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n",
"kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n",
"kw_df.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 103,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600013 COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n1 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n2 WOS:000208863900217 DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n3 WOS:000208972600008 BRAIN-MACHINE INTERFACE ; FIELD-PROGRAMMABLE G...\n4 WOS:000209043200014 CYANOBACTERIA BLOOM; DRINKING WATER TREATMENT;...",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600266</td>\n <td>ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863900217</td>\n <td>DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208972600008</td>\n <td>BRAIN-MACHINE INTERFACE ; FIELD-PROGRAMMABLE G...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209043200014</td>\n <td>CYANOBACTERIA BLOOM; DRINKING WATER TREATMENT;...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
"wos_kwd_concat.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 103,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 104,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"geotext = GeoText()\n",
"\n",
"def extract_location(input_text, key='countries'):\n",
" anomalies = {\"Malta\":\"Malta\",\n",
" \"Mongolia\":\"Mongolia\",\n",
" \"Quatar\":\"Qatar\",\n",
" \"Qatar\":\"Qatar\",\n",
" \"Ethiop\":\"Ethiopia\",\n",
" \"Nigeria\":\"Nigeria\",\n",
" \"BELAR\":\"Belarus\",\n",
" \"Venezuela\":\"Venezuela\",\n",
" \"Cyprus\":\"Cyprus\",\n",
" \"Ecuador\":\"Ecuador\",\n",
" \"U Arab\":\"United Arab Emirates\",\n",
" \"Syria\":\"Syria\",\n",
" \"Uganda\":\"Uganda\",\n",
" \"Yemen\":\"Yemen\",\n",
" \"Mali\":\"Mali\",\n",
" \"Senegal\":\"Senegal\",\n",
" \"Vatican\":\"Vatican\",\n",
" \"Uruguay\":\"Uruguay\",\n",
" \"Panama\":\"Panama\",\n",
" \"Fiji\":\"Fiji\",\n",
" \"Faroe\":\"Faroe Islands\",\n",
" \"Macedonia\":\"Macedonia\",\n",
" 'Mozambique':'Mozambique',\n",
" \"Kuwait\":\"Kuwait\",\n",
" \"Libya\":\"Libya\",\n",
" \"Turkiy\":\"Turkey\",\n",
" \"Liberia\":\"Liberia\",\n",
" \"Namibia\":\"Namibia\",\n",
" \"Ivoire\":\"Ivory Coast\",\n",
" \"Guatemala\":\"Gutemala\",\n",
" \"Paraguay\":\"Paraguay\",\n",
" \"Honduras\":\"Honduras\",\n",
" \"Nicaragua\":\"Nicaragua\",\n",
" \"Trinidad\":\"Trinidad & Tobago\",\n",
" \"Liechtenstein\":\"Liechtenstein\",\n",
" \"Greenland\":\"Denmark\"}\n",
"\n",
" extracted = geotext.extract(input_text=input_text)\n",
" found = extracted[key].keys()\n",
" if len(sorted(found))>0:\n",
" return sorted(found)[0]\n",
" elif key=='countries':\n",
" for i in ['Scotland','Wales','England', 'N Ireland']:\n",
" if i in input_text:\n",
" return 'United Kingdom'\n",
" for j in anomalies.keys():\n",
" if j in input_text:\n",
" return anomalies.get(j)\n",
" else:\n",
" return None\n",
"\n",
"with open('../eu_members.txt',\"r\") as f:\n",
" eu_countries=f.readline().split(\",\")\n",
" eu_countries=[i.strip() for i in eu_countries]\n",
"\n",
"def country_cleanup(country):\n",
" if \"USA\" in country:\n",
" return \"USA\"\n",
" elif \"China\" in country:\n",
" return \"China\"\n",
" elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n",
" return \"United Kingdom\"\n",
" else:\n",
" return country\n",
"\n",
"\n",
"def country_type(country):\n",
" if country in eu_countries:\n",
" return \"EU\"\n",
" elif country==\"China\":\n",
" return \"China\"\n",
" elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n",
" return \"Non-EU associate\"\n",
" else:\n",
" return \"Other\"\n"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"\n",
"\n",
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])"
]
},
{
"cell_type": "code",
"execution_count": 107,
"outputs": [
{
"data": {
"text/plain": "212138"
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(locations)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 108,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Authors_of_address \n0 WOS:000208863600013 Hu, Baolan \\\n1 WOS:000208863600013 Jetten, Mike S. M. \n2 WOS:000208863600013 Speth, Daan R.; Bosch, Niek; Keltjens, Jan T.;... \n3 WOS:000208863600013 Stunnenberg, Henk G. \n4 WOS:000208863600266 Chen, Yifeng \n.. ... ... \n95 WOS:000209843500045 Blautzik, Janusch; Meindl, Thomas \n96 WOS:000209843500045 Breitner, John C. S. \n97 WOS:000209843500045 Buckner, Randy L. \n98 WOS:000209843500045 Calhoun, Vince D.; Courtney, William; King, Ma... \n99 WOS:000209843500045 Castellanos, F. Xavier; Colcombe, Stanley J.; ... \n\n Address \n0 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n1 Delft Univ Technol, Dept Biotechnol, Delft, Ne... \n2 Radboud Univ Nijmegen, Dept Microbiol, Inst Wa... \n3 Radboud Univ Nijmegen, Dept Mol Biol, Nijmegen... \n4 Chinese Acad Sci, Guangzhou Inst Geochem, Guan... \n.. ... \n95 Ludwig Maximilians Univ Munchen, Inst Clin Rad... \n96 McGill Univ, Douglas Inst, Dept Psychiat, Ctr ... \n97 Harvard Univ, Dept Psychol, Cambridge, MA 0213... \n98 Mind Res Network, Albuquerque, NM 87106 USA \n99 Nathan S Kline Inst Psychiat Res, Orangeburg, ... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Authors_of_address</th>\n <th>Address</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>Hu, Baolan</td>\n <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>Jetten, Mike S. M.</td>\n <td>Delft Univ Technol, Dept Biotechnol, Delft, Ne...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>Speth, Daan R.; Bosch, Niek; Keltjens, Jan T.;...</td>\n <td>Radboud Univ Nijmegen, Dept Microbiol, Inst Wa...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>Stunnenberg, Henk G.</td>\n <td>Radboud Univ Nijmegen, Dept Mol Biol, Nijmegen...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600266</td>\n <td>Chen, Yifeng</td>\n <td>Chinese Acad Sci, Guangzhou Inst Geochem, Guan...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>95</th>\n <td>WOS:000209843500045</td>\n <td>Blautzik, Janusch; Meindl, Thomas</td>\n <td>Ludwig Maximilians Univ Munchen, Inst Clin Rad...</td>\n </tr>\n <tr>\n <th>96</th>\n <td>WOS:000209843500045</td>\n <td>Breitner, John C. S.</td>\n <td>McGill Univ, Douglas Inst, Dept Psychiat, Ctr ...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209843500045</td>\n <td>Buckner, Randy L.</td>\n <td>Harvard Univ, Dept Psychol, Cambridge, MA 0213...</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209843500045</td>\n <td>Calhoun, Vince D.; Courtney, William; King, Ma...</td>\n <td>Mind Res Network, Albuquerque, NM 87106 USA</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209843500045</td>\n <td>Castellanos, F. Xavier; Colcombe, Stanley J.; ...</td>\n <td>Nathan S Kline Inst Psychiat Res, Orangeburg, ...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n",
"locations = locations.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_2\")\n",
"locations.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 109,
"outputs": [],
"source": [
"# import dask.dataframe as dd\n",
"#\n",
"# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame\n",
"# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 110,
"outputs": [],
"source": [
"# locations_test = locations.head(1000)\n",
"# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n",
"# locations_test"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 111,
"outputs": [],
"source": [
"\n",
"# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
"locations[\"Country\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n",
"locations[\"Country\"]=locations['Country'].apply(lambda x: country_cleanup(x))\n",
"locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
"locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 111,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 112,
"outputs": [],
"source": [
"scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
"locations=locations[locations[\"Country_Type\"].isin(scope_types)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Address \n0 WOS:000208863600013 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \\\n1 WOS:000208863600013 Delft Univ Technol, Dept Biotechnol, Delft, Ne... \n2 WOS:000208863600013 Radboud Univ Nijmegen, Dept Microbiol, Inst Wa... \n3 WOS:000208863600013 Radboud Univ Nijmegen, Dept Mol Biol, Nijmegen... \n4 WOS:000208863600266 Chinese Acad Sci, Guangzhou Inst Geochem, Guan... \n\n Country City Country_Type Institution \n0 China Hangzhou China Zhejiang Univ \n1 Netherlands Delft EU Delft Univ Technol \n2 Netherlands Nijmegen EU Radboud Univ Nijmegen \n3 Netherlands Mol EU Radboud Univ Nijmegen \n4 China Guangzhou China Chinese Acad Sci ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n <td>China</td>\n <td>Hangzhou</td>\n <td>China</td>\n <td>Zhejiang Univ</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>Delft Univ Technol, Dept Biotechnol, Delft, Ne...</td>\n <td>Netherlands</td>\n <td>Delft</td>\n <td>EU</td>\n <td>Delft Univ Technol</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>Radboud Univ Nijmegen, Dept Microbiol, Inst Wa...</td>\n <td>Netherlands</td>\n <td>Nijmegen</td>\n <td>EU</td>\n <td>Radboud Univ Nijmegen</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>Radboud Univ Nijmegen, Dept Mol Biol, Nijmegen...</td>\n <td>Netherlands</td>\n <td>Mol</td>\n <td>EU</td>\n <td>Radboud Univ Nijmegen</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600266</td>\n <td>Chinese Acad Sci, Guangzhou Inst Geochem, Guan...</td>\n <td>China</td>\n <td>Guangzhou</td>\n <td>China</td>\n <td>Chinese Acad Sci</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
"univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
"univ_locations = univ_locations.drop_duplicates()\n",
"univ_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600013 China China \\\n1 WOS:000208863600013 Netherlands EU \n2 WOS:000208863600013 Netherlands EU \n3 WOS:000208863600013 Netherlands EU \n4 WOS:000208863600013 Netherlands EU \n\n author_str_id \n0 54c7bc6fe9b77434ca1bf04d763d843b \n1 df81f9da6c8f5c968c16ef0aab1bb8f9 \n2 6a775fcd8d11fcb084671b8cae4d6305 \n3 aa6accfdf7626441fe9191636dab4c35 \n4 b707b51d1ca3b5aa76de6ce6df20e6e4 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>6a775fcd8d11fcb084671b8cae4d6305</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>aa6accfdf7626441fe9191636dab4c35</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>b707b51d1ca3b5aa76de6ce6df20e6e4</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
"author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
"author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
"author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n",
"author_locations = author_locations.drop(columns=\"Author_name\")\n",
"author_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600013 China China \\\n1 WOS:000208863600013 Netherlands EU \n5 WOS:000208863600013 Netherlands EU \n7 WOS:000208863600266 China China \n13 WOS:000208863900217 China China \n... ... ... ... \n438826 WOS:000951829800021 China China \n438827 WOS:000951829800021 Netherlands EU \n438828 WOS:000952055000007 China China \n438829 WOS:000952055000007 China China \n438831 WOS:000952055000007 United Kingdom Non-EU associate \n\n author_str_id \n0 54c7bc6fe9b77434ca1bf04d763d843b \n1 df81f9da6c8f5c968c16ef0aab1bb8f9 \n5 df81f9da6c8f5c968c16ef0aab1bb8f9 \n7 5dfb4f0408a2cc8b7f36f5516938b62c \n13 00e44aa0a23a3fc9571b1053a4453a54 \n... ... \n438826 fc15bf7c800877e1c33f4a7397840faa \n438827 6b8763361150d7c3ceecf9eca9efd83b \n438828 80231479c1502ce8649717236023b6c9 \n438829 0af23824e538b0816c19239079d58c77 \n438831 b77dd6bc0ae30a2f96d43eebb1b3d89a \n\n[384417 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n <tr>\n <th>7</th>\n <td>WOS:000208863600266</td>\n <td>China</td>\n <td>China</td>\n <td>5dfb4f0408a2cc8b7f36f5516938b62c</td>\n </tr>\n <tr>\n <th>13</th>\n <td>WOS:000208863900217</td>\n <td>China</td>\n <td>China</td>\n <td>00e44aa0a23a3fc9571b1053a4453a54</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>438826</th>\n <td>WOS:000951829800021</td>\n <td>China</td>\n <td>China</td>\n <td>fc15bf7c800877e1c33f4a7397840faa</td>\n </tr>\n <tr>\n <th>438827</th>\n <td>WOS:000951829800021</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>6b8763361150d7c3ceecf9eca9efd83b</td>\n </tr>\n <tr>\n <th>438828</th>\n <td>WOS:000952055000007</td>\n <td>China</td>\n <td>China</td>\n <td>80231479c1502ce8649717236023b6c9</td>\n </tr>\n <tr>\n <th>438829</th>\n <td>WOS:000952055000007</td>\n <td>China</td>\n <td>China</td>\n <td>0af23824e538b0816c19239079d58c77</td>\n </tr>\n <tr>\n <th>438831</th>\n <td>WOS:000952055000007</td>\n <td>United Kingdom</td>\n <td>Non-EU associate</td>\n <td>b77dd6bc0ae30a2f96d43eebb1b3d89a</td>\n </tr>\n </tbody>\n</table>\n<p>384417 rows × 4 columns</p>\n</div>"
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations[author_locations['author_str_id'].duplicated(False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
"# author_primary_region\n",
"\n",
"china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
"eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
"assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n",
"\n",
"\n",
"# records that have distinct authors with different country affiliations\n",
"valid_scope = wos[((wos[record_col].isin(china))\n",
" &\n",
" ((wos[record_col].isin(eu))\n",
" |\n",
" (wos[record_col].isin(assoc))))][record_col].unique()"
]
},
{
"cell_type": "code",
"execution_count": 117,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600013 China China \\\n114146 WOS:000404623900013 China China \n114147 WOS:000404623900013 China China \n330506 WOS:000704130600006 China China \n114149 WOS:000404623900039 China China \n\n author_str_id \n0 54c7bc6fe9b77434ca1bf04d763d843b \n114146 e0d590d171727e520f187ff576a3608c \n114147 773953f1e94a1293cc8417da7b6e435d \n330506 d5296d1bbee9f1c6d4f33e6ae493410a \n114149 88907be51c34b8883aed738d51844b9e ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>114146</th>\n <td>WOS:000404623900013</td>\n <td>China</td>\n <td>China</td>\n <td>e0d590d171727e520f187ff576a3608c</td>\n </tr>\n <tr>\n <th>114147</th>\n <td>WOS:000404623900013</td>\n <td>China</td>\n <td>China</td>\n <td>773953f1e94a1293cc8417da7b6e435d</td>\n </tr>\n <tr>\n <th>330506</th>\n <td>WOS:000704130600006</td>\n <td>China</td>\n <td>China</td>\n <td>d5296d1bbee9f1c6d4f33e6ae493410a</td>\n </tr>\n <tr>\n <th>114149</th>\n <td>WOS:000404623900039</td>\n <td>China</td>\n <td>China</td>\n <td>88907be51c34b8883aed738d51844b9e</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_primary_region.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of records: 35663\n",
"Number of valid cooperation records: 31574\n"
]
}
],
"source": [
"print(f'Number of records: {len(wos)}')\n",
"print(f'Number of valid cooperation records: {len(valid_scope)}')"
]
},
{
"cell_type": "code",
"execution_count": 119,
"outputs": [],
"source": [
"wos = wos[wos[record_col].isin(valid_scope)]\n",
"locations = locations[locations[record_col].isin(valid_scope)]\n",
"univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
"author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
"author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
"affiliations = affiliations.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 121,
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 3606\nUNIVERSITY OF LONDON 1725\nUDICE-FRENCH RESEARCH UNIVERSITIES 1421\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 1330\nTSINGHUA UNIVERSITY 1330\n ... \nUNIVERSITY OF NATIONAL & WORLD ECONOMICS - BULGARIA 1\nCENTRE HOSPITALIER RENE DUBOS, PONTOISE 1\nUNIVERSITY OF PRINCE MUGRIN 1\nMINDANAO STATE UNIVERSITY-IIT 1\nTANGSHAN UNIVERSITY 1\nName: count, Length: 6772, dtype: int64"
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 122,
"outputs": [
{
"data": {
"text/plain": "Institution\nChinese Acad Sci 3600\nTsinghua Univ 1611\nShanghai Jiao Tong Univ 1359\nZhejiang Univ 1274\nUniv Elect Sci & Technol China 965\n ... \nStatSol 1\nJan Kochanowski Univ Humanities & Sci 1\nTomTom 1\nLUMC 1\nInt Digital Econ Acad 1\nName: count, Length: 14564, dtype: int64"
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 123,
"outputs": [
{
"data": {
"text/plain": "31574"
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 124,
"outputs": [
{
"data": {
"text/plain": "31574"
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 125,
"outputs": [
{
"data": {
"text/plain": "137536"
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 126,
"outputs": [
{
"data": {
"text/plain": "181023"
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "WoS Categories\n Engineering, Electrical & Electronic 6006\nComputer Science, Artificial Intelligence 4769\nComputer Science, Information Systems 3698\n Telecommunications 3271\nEngineering, Electrical & Electronic 2423\n ... \nAndrology 1\n Criminology & Penology 1\nArea Studies 1\nArt 1\n Geology 1\nName: count, Length: 415, dtype: int64"
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Research Areas\nEngineering 12704\nComputer Science 12221\nTelecommunications 3544\nImaging Science & Photographic Technology 1936\nEnvironmental Sciences & Ecology 1876\n ... \nMusic 1\nAsian Studies 1\nCultural Studies 1\nArea Studies 1\nEmergency Medicine 1\nName: count, Length: 145, dtype: int64"
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[c for c in wos.columns if \"_English\" in c]"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
"for m in metrix_levels:\n",
" wos[m] = wos[m].replace({\"article-level classification\":\"Miscellaneous\"})\n"
]
},
{
"cell_type": "code",
"execution_count": 130,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " Publication Type Authors \n0 J Maurya, S; Srivastava, PK; Yaduvanshi, A; Anan... \\\n38775 J Huang, BS; Zheng, GY; Xu, ZY; Rao, SX; Wang, SL \n38758 J Wen, SH; Hu, XH; Li, Z; Lam, HK; Sun, FC; Fang, B \n38760 J Yu, WR; McCann, J; Zhang, CY \n38762 J Zhang, AZ; Sun, GY; Liu, SH; Wang, ZJ; Wang, P... \n... ... ... \n41597 J Liu, H; Long, SX; Pinson, SRM; Tang, Z; Guerin... \n41666 J Komarizadehasl, S; Mobaraki, B; Ma, HY; Lozano... \n41621 J Xie, QH; Wang, JF; Liao, CH; Shang, JL; Lopez-... \n14505 J Li, RYM; Li, HCY \n41622 J Shen, YF; Wang, TZ; Amirat, Y; Chen, GD \n\n Book Authors Book Editors Book Group Authors \n0 NaN NaN NaN \\\n38775 NaN NaN NaN \n38758 NaN NaN NaN \n38760 NaN NaN NaN \n38762 NaN NaN NaN \n... ... ... ... \n41597 NaN NaN NaN \n41666 NaN NaN NaN \n41621 NaN NaN NaN \n14505 NaN NaN NaN \n41622 NaN NaN NaN \n\n Author Full Names \n0 Maurya, Swati; Srivastava, Prashant K.; Yaduva... \\\n38775 Huang, Bingsheng; Zheng, Guoyan; Xu, Ziyue; Ra... \n38758 Wen, Shuhuan; Hu, Xueheng; Li, Zhen; Lam, Hak ... \n38760 Yu, Weiren; McCann, Julie; Zhang, Chengyuan \n38762 Zhang, Ai Zhu; Sun, Gen Yun; Liu, Si Han; Wang... \n... ... \n41597 Liu, Huan; Long, Su-Xian; Pinson, Shannon R. M... \n41666 Komarizadehasl, Seyedmilad; Mobaraki, Behnam; ... \n41621 Xie, Qinghua; Wang, Jinfei; Liao, Chunhua; Sha... \n14505 Li, Rita Yi Man; Li, Herru Ching Yu \n41622 Shen, Yifei; Wang, Tianzhen; Amirat, Yassine; ... \n\n Book Author Full Names Group Authors \n0 NaN NaN \\\n38775 NaN NaN \n38758 NaN NaN \n38760 NaN NaN \n38762 NaN NaN \n... ... ... \n41597 NaN NaN \n41666 NaN NaN \n41621 NaN NaN \n14505 NaN NaN \n41622 NaN NaN \n\n Article Title \n0 Soil erosion in future scenario using CMIP5 mo... \\\n38775 Application of Image Processing Techniques in ... \n38758 NAO robot obstacle avoidance based on fuzzy Q-... \n38760 Efficient Pairwise Penetrating-rank Similarity... \n38762 Multi-scale segmentation of very high resoluti... \n... ... \n41597 Univariate and Multivariate QTL Analyses Revea... \n41666 Development of a Low-Cost System for the Accur... \n41621 On the Use of Neumann Decomposition for Crop C... \n14505 Have Housing Prices Gone with the Smelly Wind?... \n41622 IGBT Open-Circuit Fault Diagnosis for MMC Subm... \n\n Source Title ... \n0 JOURNAL OF HYDROLOGY ... \\\n38775 CONTRAST MEDIA & MOLECULAR IMAGING ... \n38758 INDUSTRIAL ROBOT-THE INTERNATIONAL JOURNAL OF ... ... \n38760 ACM TRANSACTIONS ON THE WEB ... \n38762 MULTIMEDIA TOOLS AND APPLICATIONS ... \n...
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Publication Type</th>\n <th>Authors</th>\n <th>Book Authors</th>\n <th>Book Editors</th>\n <th>Book Group Authors</th>\n <th>Author Full Names</th>\n <th>Book Author Full Names</th>\n <th>Group Authors</th>\n <th>Article Title</th>\n <th>Source Title</th>\n <th>...</th>\n <th>UT (Unique WOS ID)</th>\n <th>issn_var</th>\n <th>issn</th>\n <th>Domain_English</th>\n <th>Field_English</th>\n <th>SubField_English</th>\n <th>2.00 SEQ</th>\n <th>Source_title</th>\n <th>srcid</th>\n <th>issn_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>J</td>\n <td>Maurya, S; Srivastava, PK; Yaduvanshi, A; Anan...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Maurya, Swati; Srivastava, Prashant K.; Yaduva...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Soil erosion in future scenario using CMIP5 mo...</td>\n <td>JOURNAL OF HYDROLOGY</td>\n <td>...</td>\n <td>WOS:000641589600020</td>\n <td>issn</td>\n <td>00221694</td>\n <td>Applied Sciences</td>\n <td>Engineering</td>\n <td>Environmental Engineering</td>\n <td>25</td>\n <td>Journal of Hydrology</td>\n <td>5.008900e+04</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>38775</th>\n <td>J</td>\n <td>Huang, BS; Zheng, GY; Xu, ZY; Rao, SX; Wang, SL</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Huang, Bingsheng; Zheng, Guoyan; Xu, Ziyue; Ra...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Application of Image Processing Techniques in ...</td>\n <td>CONTRAST MEDIA &amp; MOLECULAR IMAGING</td>\n <td>...</td>\n <td>WOS:000416383700001</td>\n <td>issn</td>\n <td>15554309</td>\n <td>Health Sciences</td>\n <td>Clinical Medicine</td>\n <td>Nuclear Medicine &amp; Medical Imaging</td>\n <td>111</td>\n <td>Contrast Media and Molecular Imaging</td>\n <td>5.400153e+09</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>38758</th>\n <td>J</td>\n <td>Wen, SH; Hu, XH; Li, Z; Lam, HK; Sun, FC; Fang, B</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Wen, Shuhuan; Hu, Xueheng; Li, Zhen; Lam, Hak ...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NAO robot obstacle avoidance based on fuzzy Q-...</td>\n <td>INDUSTRIAL ROBOT-THE INTERNATIONAL JOURNAL OF ...</td>\n <td>...</td>\n <td>WOS:000590197400003</td>\n <td>issn</td>\n <td>0143991x</td>\n <td>Applied Sciences</td>\n <td>Engineering</td>\n <td>Industrial Engineering &amp; Automation</td>\n <td>27</td>\n <td>Industrial Robot</td>\n <td>1.804700e+04</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>38760</th>\n <td>J</td>\n <td>Yu, WR; McCann, J; Zhang, CY</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yu, Weiren; McCann, Julie; Zhang, Chengyuan</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Efficient Pairwise Penetrating-rank Similarity...</td>\n <td>ACM TRANSACTIONS ON THE WEB</td>\n <td>...</td>\n <td>WOS:000510863400004</td>\n <td>issn</td>\n <td>15591131</td>\n <td>Applied Sciences</td>\n <td>Information &amp; Communication Technologies</td>\n <td>Information Systems</td>\n <td>35</td>\n <td>ACM Transactions on the Web</td>\n <td>5.800207e+09</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>38762</th>\n <td>J</td>\n <td>Zhang, AZ; Sun, GY; Liu, SH; Wang, ZJ; Wang, P...</td>\n <td>NaN</td>\n
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metrix_levels"
]
},
{
"cell_type": "code",
"execution_count": 134,
"outputs": [],
"source": [
"record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n",
"record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n",
"record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n",
"country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 135,
"outputs": [],
"source": [
"# Basic network layout"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 136,
"outputs": [],
"source": [
"country_collabs = record_countries.merge(record_countries, on=record_col)\n",
"country_collabs = country_collabs[country_collabs[\"Country_x\"]!=country_collabs[\"Country_y\"]]\n",
"country_collabs[\"weight\"] = 0.5"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 137,
"outputs": [],
"source": [
"inst_collabs = record_institution.merge(record_institution, on=record_col)\n",
"inst_collabs = inst_collabs[inst_collabs[\"Institution_x\"]!=inst_collabs[\"Institution_y\"]]\n",
"inst_collabs[\"weight\"] = 0.5"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 138,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 139,
"outputs": [
{
"data": {
"text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']"
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drop_cols = [ws for ws in wos.columns if ((\"uthor\" in ws or \"ddress\" in ws or \"ORCID\" in\n",
" ws or \"esearcher\" in ws or \"ditor\" in ws or \"name\" in ws or 'SEQ' in ws) and \"eyword\" not in ws)]\n",
"drop_cols"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [],
"source": [
"outdir=\"wos_processed_data\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 140,
"outputs": [],
"source": [
"os.makedirs(outdir, exist_ok=True)\n",
"\n",
"wos.drop(columns=drop_cols).to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n",
"\n",
"record_countries.to_excel(f\"{outdir}/wos_countries.xlsx\", index=False)\n",
"\n",
"record_author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n",
"\n",
"record_institution.to_excel(f\"{outdir}/wos_institution_locations.xlsx\", index=False)\n",
"\n",
"kw_df.to_excel(f\"{outdir}/wos_keywords.xlsx\", index=False)\n",
"\n",
"country_types.to_excel(f\"{outdir}/wos_country_types.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 141,
"outputs": [],
"source": [
"wos.drop(columns=drop_cols).to_csv(f\"{outdir}/wos_processed.csv\", index=False, sep='\\t')\n",
"\n",
"record_countries.to_csv(f\"{outdir}/wos_countries.csv\", index=False, sep='\\t')\n",
"\n",
"record_author_locations.to_csv(f\"{outdir}/wos_author_locations.csv\", index=False, sep='\\t')\n",
"\n",
"record_institution.to_csv(f\"{outdir}/wos_institution_locations.csv\", index=False, sep='\\t')\n",
"\n",
"kw_df.to_csv(f\"{outdir}/wos_keywords.csv\", index=False, sep='\\t')\n",
"\n",
"country_types.to_csv(f\"{outdir}/wos_country_types.csv\", index=False, sep='\\t')\n",
"\n",
"inst_collabs.to_csv(f\"{outdir}/wos_inst_collabs.csv\", index=False, sep='\\t')\n",
"\n",
"country_collabs.to_csv(f\"{outdir}/wos_country_collabs.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"wos_areas.to_csv(f\"{outdir}/wos_research_areas.csv\", index=False, sep='\\t')\n",
"\n",
"wos_subcat.to_csv(f\"{outdir}/wos_categories.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 151,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) \n697 WOS:000290510900023 \\\n871 WOS:000291698400013 \n1127 WOS:000291752600003 \n1470 WOS:000294492600001 \n1772 WOS:000295615800053 \n... ... \n211125 WOS:000926330000001 \n211658 WOS:000929537500051 \n211686 WOS:000929537500051 \n211719 WOS:000929537500051 \n212266 WOS:000929737300001 \n\n Authors_of_address \n697 Liu, Jian-Guo \\\n871 Abdesselam, A.; Barr, A. J.; Beauchemin, P. H.... \n1127 Hill, Jamie R.; Kelm, Sebastian; Deane, Charlo... \n1470 Barr, A. J.; Heinemann, F. E. W.; de Renstrom,... \n1772 Huang, Xiaolei \n... ... \n211125 Wang, Tingyan \n211658 Lewycka, Sonia \n211686 Maude, Richard James \n211719 Moore, Catrin E. \n212266 Matthews, Philippa C. \n\n Address Country \n697 Univ Oxford, CABDyN Complex Ctr, Said Business... United Kingdom \\\n871 Univ Oxford, Dept Phys, Oxford OX1 3RH, England United Kingdom \n1127 Univ Oxford, Dept Stat, Oxford OX1 3TG, England United Kingdom \n1470 Univ Oxford, Dept Phys, Oxford OX1 3RH, England United Kingdom \n1772 Univ Oxford, Oxford OX1 2JD, England United Kingdom \n... ... ... \n211125 Univ Oxford, Nuffield Dept Med, Oxford, England United Kingdom \n211658 Univ Oxford, Ctr Trop Med & Global Hlth, Oxfor... United Kingdom \n211686 Univ Oxford, Nuffield Dept Med, Oxford, England United Kingdom \n211719 Univ Oxford, Big Data Inst, Oxford, England United Kingdom \n212266 Univ Oxford, Nuffield Dept Expt Med, Oxford, E... United Kingdom \n\n City Country_Type \n697 Oxford Non-EU associate \n871 Oxford Non-EU associate \n1127 Oxford Non-EU associate \n1470 Oxford Non-EU associate \n1772 Oxford Non-EU associate \n... ... ... \n211125 Meda Non-EU associate \n211658 Meda Non-EU associate \n211686 Meda Non-EU associate \n211719 Biga Non-EU associate \n212266 Meda Non-EU associate \n\n[789 rows x 6 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Authors_of_address</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>697</th>\n <td>WOS:000290510900023</td>\n <td>Liu, Jian-Guo</td>\n <td>Univ Oxford, CABDyN Complex Ctr, Said Business...</td>\n <td>United Kingdom</td>\n <td>Oxford</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>871</th>\n <td>WOS:000291698400013</td>\n <td>Abdesselam, A.; Barr, A. J.; Beauchemin, P. H....</td>\n <td>Univ Oxford, Dept Phys, Oxford OX1 3RH, England</td>\n <td>United Kingdom</td>\n <td>Oxford</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>1127</th>\n <td>WOS:000291752600003</td>\n <td>Hill, Jamie R.; Kelm, Sebastian; Deane, Charlo...</td>\n <td>Univ Oxford, Dept Stat, Oxford OX1 3TG, England</td>\n <td>United Kingdom</td>\n <td>Oxford</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>1470</th>\n <td>WOS:000294492600001</td>\n <td>Barr, A. J.; Heinemann, F. E. W.; de Renstrom,...</td>\n <td>Univ Oxford, Dept Phys, Oxford OX1 3RH, England</td>\n <td>United Kingdom</td>\n <td>Oxford</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>1772</th>\n <td>WOS:000295615800053</td>\n <td>Huang, Xiaolei</td>\n <td>Univ Oxford, Oxford OX1 2JD, England</td>\n <td>United Kingdom</td>\n <td>Oxford</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>211125</th>\n <td>WOS:000926330000001</td>\n <td>Wang, Tingyan</td>\n <td>Univ Oxford, Nuffield Dept Med, Oxford, England</td>\n <td>United Kingdom</td>\n <td>Meda</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>211658</th>\n <td>WOS:000929537500051</td>\n <td>Lewycka, Sonia</td>\n <td>Univ Oxford, Ctr Trop Med &amp; Global Hlth, Oxfor...</td>\n <td>United Kingdom</td>\n <td>Meda</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>211686</th>\n <td>WOS:000929537500051</td>\n <td>Maude, Richard James</td>\n <td>Univ Oxford, Nuffield Dept Med, Oxford, England</td>\n <td>United Kingdom</td>\n <td>Meda</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>211719</th>\n <td>WOS:000929537500051</td>\n <td>Moore, Catrin E.</td>\n <td>Univ Oxford, Big Data Inst, Oxford, England</td>\n <td>United Kingdom</td>\n <td>Biga</td>\n <td>Non-EU associate</td>\n </tr>\n <tr>\n <th>212266</th>\n <td>WOS:000929737300001</td>\n <td>Matthews, Philippa C.</td>\n <td>Univ Oxford, Nuffield Dept Expt Med, Oxford, E...</td>\n <td>United Kingdom</td>\n <td>Meda</td>\n <td>Non-EU associate</td>\n </tr>\n </tbody>\n</table>\n<p>789 rows × 6 columns</p>\n</div>"
},
"execution_count": 151,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"locations[locations[\"Address\"].str.contains(\"Univ Oxford\")]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 142,
"outputs": [],
"source": [
"inv = record_institution.groupby(\"Institution\")[\"Country\"].value_counts().reset_index(level=[0,1])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 147,
"outputs": [
{
"data": {
"text/plain": "Empty DataFrame\nColumns: [UT (Unique WOS ID), Address, Country, City, Country_Type, Institution]\nIndex: []",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>\n</div>"
},
"execution_count": 147,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[univ_locations[\"Address\"].str.strip().str.strip(\";\").str.contains(\";\")]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 146,
"outputs": [
{
"data": {
"text/plain": " Institution Country count\n95 Univ Oxford United Kingdom 1\n3125 Dept Engn Univ Oxford United Kingdom 1\n13397 Univ Oxford United Kingdom 629\n13398 Univ Oxford China 2",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Institution</th>\n <th>Country</th>\n <th>count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>95</th>\n <td>Univ Oxford</td>\n <td>United Kingdom</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3125</th>\n <td>Dept Engn Univ Oxford</td>\n <td>United Kingdom</td>\n <td>1</td>\n </tr>\n <tr>\n <th>13397</th>\n <td>Univ Oxford</td>\n <td>United Kingdom</td>\n <td>629</td>\n </tr>\n <tr>\n <th>13398</th>\n <td>Univ Oxford</td>\n <td>China</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 146,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inv[inv[\"Institution\"].str.contains(\"Univ Oxford\")]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 143,
"outputs": [
{
"data": {
"text/plain": "Institution\n Aalto Univ 1\n Aix Marseille Univ 1\n Alexandru Ioan Cuza Univ 1\n Av Rovisco Pais 1 1\n Brandenburg Tech Univ Cottbus 1\n ..\niMinds 1\niOLAP Inc 1\nneuroCare Grp 1\nsen Univ Guangzhou 1\nvon Hoerner & Sulger GmbH 1\nName: Country, Length: 14564, dtype: int64"
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"record_institution.groupby(\"Institution\")[\"Country\"].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Simple NLP part"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGdCAYAAAAPLEfqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAs8klEQVR4nO3de3RU5b3/8U8gJAQkE25JyI8AqSCXgnK4NEaRlpJDkGjl0h6psaCkUDRREBGDl1TFNhAVAbWkHhFwCYqcA0jhgMSgcKoRJBq5VANoMFAygRqS4SIhJPv3Byd7MSTAQxiYyfB+rbXXYvZ+5pnvN5u95rP27NkTYFmWJQAAAFxQI28XAAAA0BAQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwEersAf1FdXa2DBw+qRYsWCggI8HY5AADAgGVZOnr0qKKiotSo0YXPJRGaPOTgwYOKjo72dhkAAKAe9u/fr/bt219wDKHJQ1q0aCHpzB89NDTUy9UAAAATLpdL0dHR9vv4hRCaPKTmI7nQ0FBCEwAADYzJpTVcCA4AAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGAg0NsFALj2dEpb6+0SLtm+mYneLgGAl3GmCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwIBXQ9PmzZt15513KioqSgEBAVq1apXbdsuylJ6ernbt2ikkJETx8fHas2eP25jS0lIlJSUpNDRUYWFhSk5O1rFjx9zGbN++XbfddpuaNm2q6OhoZWZm1qpl+fLl6tatm5o2bapevXrpf/7nfzzeLwAAaLi8GpqOHz+um266Sa+99lqd2zMzMzVv3jxlZWVpy5Ytat68uRISEnTy5El7TFJSknbt2qXs7GytWbNGmzdv1oQJE+ztLpdLQ4YMUceOHZWXl6cXXnhBzzzzjF5//XV7zKeffqrf/va3Sk5O1pdffqnhw4dr+PDh2rlz55VrHgAANCgBlmVZ3i5CkgICArRy5UoNHz5c0pmzTFFRUXr00Uc1depUSVJ5ebkiIiK0aNEijR49Wl9//bV69Oihzz//XP369ZMkrV+/XsOGDdOBAwcUFRWl+fPn68knn5TT6VRQUJAkKS0tTatWrdI333wjSbr77rt1/PhxrVmzxq7n5ptvVu/evZWVlWVUv8vlksPhUHl5uUJDQz31ZwH8Uqe0td4u4ZLtm5no7RIAXAGX8v7ts9c0FRYWyul0Kj4+3l7ncDgUGxur3NxcSVJubq7CwsLswCRJ8fHxatSokbZs2WKPGThwoB2YJCkhIUEFBQU6cuSIPebs16kZU/M6damoqJDL5XJbAACA//LZ0OR0OiVJERERbusjIiLsbU6nU+Hh4W7bAwMD1apVK7cxdc1x9mucb0zN9rpkZGTI4XDYS3R09KW2CAAAGhCfDU2+bvr06SovL7eX/fv3e7skAABwBflsaIqMjJQklZSUuK0vKSmxt0VGRurQoUNu20+fPq3S0lK3MXXNcfZrnG9Mzfa6BAcHKzQ01G0BAAD+y2dDU0xMjCIjI5WTk2Ovc7lc2rJli+Li4iRJcXFxKisrU15enj1m48aNqq6uVmxsrD1m8+bNqqystMdkZ2era9euatmypT3m7NepGVPzOgAAAF4NTceOHVN+fr7y8/Mlnbn4Oz8/X0VFRQoICNDkyZP1/PPPa/Xq1dqxY4fGjBmjqKgo+xt23bt319ChQzV+/Hht3bpVn3zyiVJTUzV69GhFRUVJku655x4FBQUpOTlZu3bt0rJlyzR37lxNmTLFrmPSpElav369XnrpJX3zzTd65plntG3bNqWmpl7tPwkAAPBRgd588W3btmnQoEH245ogM3bsWC1atEjTpk3T8ePHNWHCBJWVlWnAgAFav369mjZtaj9nyZIlSk1N1eDBg9WoUSONGjVK8+bNs7c7HA5t2LBBKSkp6tu3r9q0aaP09HS3ezndcsstWrp0qZ566ik98cQT6tKli1atWqWePXtehb8CAABoCHzmPk0NHfdpAsxxnyYAvsIv7tMEAADgSwhNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABgK9XQAANASd0tZ6u4RLtm9mordLAPwKZ5oAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAM+HRoqqqq0tNPP62YmBiFhITo+uuv14wZM2RZlj3Gsiylp6erXbt2CgkJUXx8vPbs2eM2T2lpqZKSkhQaGqqwsDAlJyfr2LFjbmO2b9+u2267TU2bNlV0dLQyMzOvSo8AAKBh8OnQNGvWLM2fP1+vvvqqvv76a82aNUuZmZl65ZVX7DGZmZmaN2+esrKytGXLFjVv3lwJCQk6efKkPSYpKUm7du1Sdna21qxZo82bN2vChAn2dpfLpSFDhqhjx47Ky8vTCy+8oGeeeUavv/76Ve0XAAD4rgDr7NM2PuaOO+5QRESEFixYYK8bNWqUQkJC9Pbbb8uyLEVFRenRRx/V1KlTJUnl5eWKiIjQokWLNHr0aH399dfq0aOHPv/8c/Xr10+StH79eg0bNkwHDhxQVFSU5s+fryeffFJOp1NBQUGSpLS0NK1atUrffPONUa0ul0sOh0Pl5eUKDQ318F8C8C+d0tZ6u4Rrwr6Zid4uAfB5l/L+7dNnmm655Rbl5ORo9+7dkqSvvvpKf//733X77bdLkgoLC+V0OhUfH28/x+FwKDY2Vrm5uZKk3NxchYWF2YFJkuLj49WoUSNt2bLFHjNw4EA7MElSQkKCCgoKdOTIkTprq6iokMvlclsAAID/CvR2AReSlpYml8ulbt26qXHjxqqqqtKf/vQnJSUlSZKcTqckKSIiwu15ERER9jan06nw8HC37YGBgWrVqpXbmJiYmFpz1Gxr2bJlrdoyMjL07LPPeqBLAADQEPj0mab33ntPS5Ys0dKlS/XFF19o8eLFevHFF7V48WJvl6bp06ervLzcXvbv3+/tkgAAwBXk02eaHnvsMaWlpWn06NGSpF69eun7779XRkaGxo4dq8jISElSSUmJ2rVrZz+vpKREvXv3liRFRkbq0KFDbvOePn1apaWl9vMjIyNVUlLiNqbmcc2YcwUHBys4OPjymwQAAA2CT59pOnHihBo1ci+xcePGqq6uliTFxMQoMjJSOTk59naXy6UtW7YoLi5OkhQXF6eysjLl5eXZYzZu3Kjq6mrFxsbaYzZv3qzKykp7THZ2trp27VrnR3MAAODa49Oh6c4779Sf/vQnrV27Vvv27dPKlSs1e/ZsjRgxQpIUEBCgyZMn6/nnn9fq1au1Y8cOjRkzRlFRURo+fLgkqXv37ho6dKjGjx+vrVu36pNPPlFqaqpGjx6tqKgoSdI999yjoKAgJScna9euXVq2bJnmzp2rKVOmeKt1AADgY3z647lXXnlFTz/9tB588EEdOnRIUVFR+sMf/qD09HR7zLRp03T8+HFNmDBBZWV
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load(\"en_core_web_lg\")\n",
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
"wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ')\n",
"# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n",
"\n",
"vectors = list()\n",
"vector_norms = list()\n",
"\n",
"for doc in nlp.pipe(wos_nlp['Document'].astype('unicode').values, batch_size=300,\n",
" n_process=4):\n",
" vectors.append(doc.vector)\n",
" vector_norms.append(doc.vector_norm)\n",
"\n",
"wos_nlp['vector'] = vectors\n",
"wos_nlp['vector_norm'] = vector_norms\n",
"wos_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) TNSE-X TNSE-Y\n0 WOS:000641589600020 131.783783 -4.202979\n1 WOS:000590197400003 74.897812 89.280334\n2 WOS:000510863400004 84.939049 23.416033\n3 WOS:000403039400031 -39.527546 54.230900\n4 WOS:000439363600016 -59.109379 72.877693",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>TNSE-X</th>\n <th>TNSE-Y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000641589600020</td>\n <td>131.783783</td>\n <td>-4.202979</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000590197400003</td>\n <td>74.897812</td>\n <td>89.280334</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000510863400004</td>\n <td>84.939049</td>\n <td>23.416033</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000403039400031</td>\n <td>-39.527546</td>\n <td>54.230900</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000439363600016</td>\n <td>-59.109379</td>\n <td>72.877693</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"# % matplotlib inline\n",
"\n",
"vector_data = pd.DataFrame(wos_nlp[\"vector\"].to_list(), index=wos_nlp[record_col]).reset_index()\n",
"vector_data.head()\n",
"\n",
"labels = vector_data.values[:, 0]\n",
"record_vectors = vector_data.values[:, 1:]\n",
"\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAGwCAYAAABM9z+ZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gUxRvA8e/lUi6X3nslFVIIvYcqHQQERBARQVEBFVHEhqCCDUHFDoJIBwHpvXcSSIGQ3nvvyeXK/v44OIgJGFRsv/08Tx643dmd2c3l7t6bmXckgiAIiEQikUgkEolEIpHovuj93Q0QiUQikUgkEolEon8jMZgSiUQikUgkEolEot9BDKZEIpFIJBKJRCKR6HcQgymRSCQSiUQikUgk+h3EYEokEolEIpFIJBKJfgcxmBKJRCKRSCQSiUSi30EMpkQikUgkEolEIpHod9D/uxvwb6PRaMjNzcXMzAyJRPJ3N0ckEolEIlELCIJAVVUVzs7O6Ok9+O+S1Wo1SqXygdcjEon+XAYGBkil0haXF4Op+5Sbm4ubm9vf3QyRSCQSiUS/Q1ZWFq6urg/s/IIgkJ+fT3l5+QOrQyQSPViWlpY4Ojq2qONEDKbuk5mZGaB9MTY3N/+bWyMSiUQikaglKisrcXNz072PPyi3Ail7e3vkcrk4ikUk+hcRBIHa2loKCwsBcHJy+s1jxGDqPt16UTQ3NxeDKZFIJBKJ/mUeZHCjVqt1gZSNjc0Dq0ckEj04xsbGABQWFmJvb/+bQ/7EBBQikUgkEolEf4Jbc6Tkcvnf3BKRSPRH3Pobbsm8RzGYEolEIpFIJPoTiUP7RKJ/t/v5GxaDKZFIJBKJRCKRSCT6HcRgSiQSiUQikUgkEol+BzGYEolEIpFIJBL9Zd555x3atm2rezxlyhQefvjhP3TOEydOIJFIHmhK+j+jnaL/HjGYEolEIpFIJPqHUWsEzqeU8EtUDudTSlBrhL+k3vPnzyOVShk6dOhfUh/AZ599xpo1ax54PdHR0YwYMQJ7e3tkMhmenp6MHz9elwb7t/xV7RT9u4ip0UUikUgkEon+QQ5cy2Ph7jjyKup125wsZCwY3ppBQb+97s0fsWrVKmbNmsWqVavIzc3F2dn5gdYHYGFh8cDrKCoqol+/fgwbNoyDBw9iaWlJeno6u3btoqampkXn+CvaKfr3EXumRCKRSCQSif4hDlzL49l1VxoFUgD5FfU8u+4KB67lPbC6q6ur2bx5M88++yxDhw5t0gtzayjd3r17CQkJQSaT0aVLF65du6Yrs2bNGiwtLdm5cye+vr7IZDIGDhxIVlbWXev99fA5jUbDkiVL8PLywtjYmNDQULZt29bomH379uHn54exsTF9+vQhPT39ntd29uxZKioqWLlyJWFhYXh5edGnTx+WLVuGl5eXrtz169cZNmwY5ubmmJmZ0bNnT1JSUn5XO2/dr6NHj9KhQwfkcjndunUjISGhUdt2795Nx44dkclk2NraMmrUKN0+hULB3LlzcXFxwcTEhM6dO3PixAnd/oyMDIYPH46VlRUmJia0adOGffv23fNeiP5cYjAlEolEIpFI9A+g1ggs3B1HcwP6bm1buDvugQ3527JlCwEBAfj7+zNp0iR++OEHBKFpXa+88gpLly7l8uXL2NnZMXz48Ebr8dTW1vL++++zdu1azp49S3l5OY8++miL27FkyRLWrl3LN998w/Xr13nppZeYNGkSJ0+eBCArK4vRo0czfPhwoqKimDZtGq+99to9z+no6IhKpWLHjh3NXhNATk4OvXr1wsjIiGPHjhEZGcnUqVNRqVS/q523vPHGGyxdupSIiAj09fWZOnWqbt/evXsZNWoUQ4YM4erVqxw9epROnTrp9s+cOZPz58+zadMmYmJiGDt2LIMGDSIpKQmA559/HoVCwalTp4iNjeXDDz/E1NT0t2+y6E8jDvMTiUQikUgk+ge4lFbapEfqTgKQV1HPpbRSuray+dPrX7VqFZMmTQJg0KBBVFRUcPLkSXr37t2o3IIFCxgwYAAAP/74I66uruzYsYNx48YB2oVOV6xYQefOnXVlAgMDuXTpUqNAoTkKhYLFixdz5MgRunbtCoC3tzdnzpzh22+/JTw8nK+//ppWrVqxdOlSAPz9/XWBxN106dKF119/nccee4wZM2bQqVMn+vbty+TJk3FwcADgyy+/xMLCgk2bNmFgYACAn5/f727nLe+//77u8WuvvcbQoUOpr69HJpPx/vvv8+ijj7Jw4UJd+dDQUAAyMzNZvXo1mZmZuuGWc+fO5cCBA6xevZrFixeTmZnJmDFjCA4O1rVB9NcSe6ZEIpFIJBKJ/gEKq+4eSP2ecvcjISGBS5cuMWHCBAD09fUZP348q1atalL2VvAAYG1tjb+/Pzdu3NBt09fXp2PHjrrHAQEBWFpaNipzN8nJydTW1jJgwABMTU11P2vXrtUNt7tx44YuUGuuTXfz/vvvk5+fzzfffEObNm345ptvCAgIIDY2FoCoqCh69uypC6T+aDtvCQkJ0f3fyUk75+1W0ouoqCj69evXbB2xsbGo1Wr8/Pwa1XHy5EldHbNnz+a9996je/fuLFiwgJiYmN9su+jPJfZMiUQikUgkEv0D2JvJ/tRy92PVqlWoVKpGCScEQcDIyIgVK1b8ZckXqqurAe3wNxcXl0b7jIyM/vD5bWxsGDt2LGPHjmXx4sWEhYXxySef8OOPP2JsbPxA2nlncCaRSADtfCvgnnVWV1cjlUqJjIxEKpU22ndrKN+0adMYOHAge/fu5dChQyxZsoSlS5cya9asFl+L6I8Re6ZEIpFIJBKJ/gE6eVnjZCFDcpf9ErRZ/Tp5Wf+p9apUKtauXcvSpUuJiorS/URHR+Ps7MzGjRsblb9w4YLu/2VlZSQmJhIYGNjofBEREbrHCQkJlJeXNypzN61bt8bIyIjMzEx8fHwa/bi5uQHohgzerU0tZWhoSKtWrXTZ/EJCQjh9+nSj+V9/pJ0tERISwtGjR5vdFxYWhlqtprCwsEkdjo6OunJubm7MmDGD7du38/LLL/P999+3uH7RHyf2TIlEov+cEwmFfHcyhZ+e6oxUKn5nJBKJ/h2kehIWDG/Ns+uuIIFGiShuBVgLhrdGqne3cOv32bNnD2VlZTz11FNNeqDGjBnDqlWrmDFjhm7bokWLsLGxwcHBgTfeeANbW9tGWe4MDAyYNWsWn3/+Ofr6+sycOZMuXbr85nwpADMzM+bOnctLL72ERqOhR48eVFRUcPbsWczNzXniiSeYMWMGS5cu5ZVXXmHatGlERkb+5vpPe/bsYdOmTTz66KP4+fkhCAK7d+9m3759rF69GtAme/jiiy949NFHmT9/PhYWFly4cIFOnTrh7+9/3+1siQULFtCvXz9atWrFo48+ikqlYt++fcybNw8/Pz8mTpzI5MmTWbp0KWFhYRQVFXH06FFCQkIYOnQoL774IoMHD8bPz4+ysjKOHz/eoqBV9OcRP2WIRKL/HEu5AQ4WMjGQEolE/zqDgpz4elI7HC0aD+VztJDx9aR2D2SdqVWrVtG/f/9mh/KNGTOGiIiIRnNxPvjgA1544QXat29Pfn4+u3fvxtDQULdfLpczb948HnvsMbp3746pqSmbN29ucXveffdd3nrrLZYsWUJgYCCDBg1i7969uhTm7u7u/Pzzz+zcuZPQ0FC++eYbFi9efM9ztm7dGrlczssvv0zbtm3p0qULW7ZsYeXKlTz++OOAdgjgsWPHqK6uJjw8nPbt2/P999/fdQ7Vb7WzJXr37s3WrVvZtWsXbdu2pW/fvo163VavXs3kyZN5+eWX8ff35+GHH+by5cu4u7sDoFaref7553X1+/n58dVXX7W4ftEfJxHulh9S1KzKykosLCyoqKjA3Nz8726OSCQSiUSiFvgr3r/r6+tJS0vDy8sLmeyPzWtSawQupZVSWFWPvZl2aN+f3SN1v06cOEGfPn0oKyvD0tKy2TJr1qzhxRdfpLy8/C9tm0j0Z7qfv2VxmJ9IJGqkvLaBA9fyebST+z3L7YrOwc1KzrcnU5j
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"wos_plot = wos_nlp.merge(tnse_data, on=record_col)\n",
"\n",
"g = sns.scatterplot(wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'], x=\"TNSE-X\", y=\"TNSE-Y\",\n",
" hue='Domain_English', s=1)\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
"wos_plot.head()\n",
"wos_nlp = wos_plot[[record_col, \"Document\", \"keyword_all\", \"TNSE-X\", \"TNSE-Y\"]]\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [],
"source": [
"\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 93,
"outputs": [],
"source": [
"wos_nlp.to_csv(f\"{outdir}/wos_nlp.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": "Index(['UT (Unique WOS ID)', 'Document', 'keyword_all', 'TNSE-X', 'TNSE-Y'], dtype='object')"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_nlp.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 94,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlQAAAGdCAYAAADUl+3IAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA1aklEQVR4nO3deXRU9f3/8Ve2ScIyCVsSUgKkBYUIggQIU9B++ZISNLYi2AIipIjygwaERNmqgrV+BfGAQFlSa2voqZTlHKFKSjANm0pkCSCLEqmiQcMkWEgGoiQhc39/9Jv7ZQgq5I5MlufjnHuOcz/v3Hm/Rzt59ebOHT/DMAwBAACgzvx93QAAAEBDR6ACAACwiEAFAABgEYEKAADAIgIVAACARQQqAAAAiwhUAAAAFhGoAAAALAr0dQONhdvtVlFRkVq2bCk/Pz9ftwMAAK6DYRi6cOGCoqOj5e9f9/NMBCovKSoqUkxMjK/bAAAAdXD69Gl16NChzj9PoPKSli1bSvrPvxC73e7jbgAAwPVwuVyKiYkxf4/XFYHKS2r+zGe32wlUAAA0MFYv1+GidAAAAIsIVAAAABYRqAAAACwiUAEAAFhEoAIAALCIQAUAAGARgQoAAMAiAhUAAIBFBCoAAACLCFQAAAAWEagAAAAsIlABAABYRKACAACwiEAFAABgUaCvG8D16Twny9ct3LBPFyb7ugUAAG4KzlABAABYRKACAACwiEAFAABgEYEKAADAIgIVAACART4PVF988YUeeughtWnTRqGhoerZs6cOHDhgrhuGoXnz5ql9+/YKDQ1VYmKiTp486XGMc+fOaezYsbLb7QoPD9fEiRN18eJFj5ojR47ozjvvVEhIiGJiYrRo0aJavWzcuFHdunVTSEiIevbsqX/84x/fz9AAAKBR8WmgOn/+vAYOHKigoCBt3bpVH3zwgRYvXqxWrVqZNYsWLdLy5cuVkZGhvXv3qnnz5kpKStKlS5fMmrFjx+r48ePKycnRli1btHv3bk2aNMlcd7lcGjp0qDp16qT8/Hy9+OKLeuaZZ/Tyyy+bNXv27NGYMWM0ceJEHTp0SMOHD9fw4cN17Nixm/NiAACABsvPMAzDV08+Z84cvfvuu3r77bevuW4YhqKjo/X444/riSeekCSVlZUpMjJSmZmZGj16tD788EPFxcVp//796tu3ryQpOztb99xzjz7//HNFR0dr9erVevLJJ+V0OmWz2czn3rx5s06cOCFJGjVqlMrLy7Vlyxbz+QcMGKDevXsrIyPjO2dxuVwKCwtTWVmZ7Ha7pdflWrgPFQAA3uet398+PUP1xhtvqG/fvvrFL36hiIgI3XHHHfrjH/9orp86dUpOp1OJiYnmvrCwMCUkJCgvL0+SlJeXp/DwcDNMSVJiYqL8/f21d+9es+auu+4yw5QkJSUlqaCgQOfPnzdrrnyempqa57laRUWFXC6XxwYAAJomnwaqTz75RKtXr1bXrl21bds2TZkyRY899pjWrFkjSXI6nZKkyMhIj5+LjIw015xOpyIiIjzWAwMD1bp1a4+aax3jyuf4ppqa9astWLBAYWFh5hYTE3PD8wMAgMbBp4HK7XarT58+ev7553XHHXdo0qRJevTRR6/rT2y+NnfuXJWVlZnb6dOnfd0SAADwEZ8Gqvbt2ysuLs5jX/fu3VVYWChJioqKkiQVFxd71BQXF5trUVFRKikp8Vi/fPmyzp0751FzrWNc+RzfVFOzfrXg4GDZ7XaPDQAANE0+DVQDBw5UQUGBx76PPvpInTp1kiTFxsYqKipKubm55rrL5dLevXvlcDgkSQ6HQ6WlpcrPzzdrtm/fLrfbrYSEBLNm9+7dqqqqMmtycnJ06623mp8odDgcHs9TU1PzPAAAAN/Ep4EqLS1N7733np5//nn961//0tq1a/Xyyy8rNTVVkuTn56cZM2boueee0xtvvKGjR49q/Pjxio6O1vDhwyX954zWsGHD9Oijj2rfvn169913NXXqVI0ePVrR0dGSpAcffFA2m00TJ07U8ePHtX79ei1btkzp6elmL9OnT1d2drYWL16sEydO6JlnntGBAwc0derUm/66AACAhiXQl0/er18/bdq0SXPnztWzzz6r2NhYLV26VGPHjjVrZs2apfLyck2aNEmlpaUaNGiQsrOzFRISYta89tprmjp1qoYMGSJ/f3+NHDlSy5cvN9fDwsL01ltvKTU1VfHx8Wrbtq3mzZvnca+qH//4x1q7dq2eeuop/eY3v1HXrl21efNm9ejR4+a8GAAAoMHy6X2oGhPuQ1Ub96ECANR3jeI+VAAAAI0BgQoAAMAiAhUAAIBFBCoAAACLCFQAAAAWEagAAAAsIlABAABYRKACAACwiEAFAABgEYEKAADAIgIVAACARQQqAAAAiwhUAAAAFhGoAAAALCJQAQAAWESgAgAAsIhABQAAYBGBCgAAwCICFQAAgEUEKgAAAIsIVAAAABYRqAAAACwiUAEAAFhEoAIAALCIQAUAAGARgQoAAMAiAhUAAIBFBCoAAACLCFQAAAAWEagAAAAsIlABAABYRKACAACwiEAFAABgEYEKAADAIgIVAACARQQqAAAAiwhUAAAAFhGoAAAALCJQAQAAWESgAgAAsIhABQAAYBGBCgAAwCICFQAAgEUEKgAAAIsIVAAAABYRqAAAACzyaaB65pln5Ofn57F169bNXL906ZJSU1PVpk0btWjRQiNHjlRxcbHHMQoLC5WcnKxmzZopIiJCM2fO1OXLlz1qdu7cqT59+ig4OFhdunRRZmZmrV5Wrlypzp07KyQkRAkJCdq3b9/3MjMAAGh8fH6G6rbbbtOZM2fM7Z133jHX0tLS9Oabb2rjxo3atWuXioqKNGLECHO9urpaycnJqqys1J49e7RmzRplZmZq3rx5Zs2pU6eUnJyswYMH6/Dhw5oxY4YeeeQRbdu2zaxZv3690tPTNX/+fB08eFC9evVSUlKSSkpKbs6LAAAAGjQ/wzAMXz35M888o82bN+vw4cO11srKytSuXTutXbtWDzzwgCTpxIkT6t69u/Ly8jRgwABt3bpV9957r4qKihQZGSlJysjI0OzZs3X27FnZbDbNnj1bWVlZOnbsmHns0aNHq7S0VNnZ2ZKkhIQE9evXTytWrJAkud1uxcTEaNq0aZozZ851zeJyuRQWFqaysjLZ7XYrL8s1dZ6T5fVjft8+XZjs6xYAAPhW3vr97fMzVCdPnlR0dLR++MMfauzYsSosLJQk5efnq6qqSomJiWZtt27d1LFjR+Xl5UmS8vLy1LNnTzNMSVJSUpJcLpeOHz9u1lx5jJqammNUVlYqPz/fo8bf31+JiYlmzbVUVFTI5XJ5bAAAoGnyaaBKSEhQZmamsrOztXr1ap06dUp33nmnLly4IKfTKZvNpvDwcI+fiYyMlNPplCQ5nU6PMFWzXrP2bTUul0tff/21vvzyS1VXV1+zpuYY17JgwQKFhYWZW0xMTJ1eAwAA0PAF+vLJ7777bvOfb7/9diUkJKhTp07asGGDQkNDfdjZd5s7d67S09PNxy6Xi1AFAEAT5fM/+V0pPDxct9xyi/71r38pKipKlZWVKi0t9agpLi5WVFSUJCkqKqrWp/5qHn9Xjd1uV2hoqNq2bauAgIBr1tQc41qCg4Nlt9s9NgAA0DTVq0B18eJFffzxx2rfvr3i4+MVFBSk3Nxcc72goECFhYVyOBySJIfDoaNHj3p8Gi8nJ0d2u11xcXFmzZXHqKmpOYbNZlN8fLxHjdvtVm5urlkDAADwbXwaqJ544gnt2rVLn376qfbs2aP7779fAQEBGjNmjMLCwjRx4kSlp6drx44dys/P14QJE+RwODRgwABJ0tChQxUXF6dx48bp/fff17Zt2/TUU08pNTVVwcHBkqTJkyfrk08+0axZs3TixAmtWrVKGzZsUFpamtlHenq6/vjHP2rNmjX68MMPNWXKFJWXl2vChAk+eV0AAEDD4tNrqD7//HONGTNG//73v9WuXTsNGjRI7733ntq1aydJeumll+Tv76+RI0eqoqJCSUlJWrVqlfn
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load(\"en_core_web_lg\")\n",
"kwd_nlp = pd.DataFrame(kw_df[\"keyword_all\"].drop_duplicates())\n",
"# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n",
"\n",
"vectors = list()\n",
"vector_norms = list()\n",
"\n",
"for doc in nlp.pipe(kwd_nlp['keyword_all'].astype('unicode').values, batch_size=300,\n",
" n_process=4):\n",
" vectors.append(doc.vector)\n",
" vector_norms.append(doc.vector_norm)\n",
"\n",
"kwd_nlp['vector'] = vectors\n",
"kwd_nlp['vector_norm'] = vector_norms\n",
"kwd_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 95,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) TNSE-X TNSE-Y\n0 COMPARATIVE GENOMICS -114.811630 -43.915569\n1 ANAMMOX 8.044455 100.761032\n2 KUENENIA STUTTGARTIENSIS 8.044455 100.761032\n3 METAGENOMICS 8.044455 100.761032\n4 ENRICHMENT CULTURE -99.356590 -78.270439",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>TNSE-X</th>\n <th>TNSE-Y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>COMPARATIVE GENOMICS</td>\n <td>-114.811630</td>\n <td>-43.915569</td>\n </tr>\n <tr>\n <th>1</th>\n <td>ANAMMOX</td>\n <td>8.044455</td>\n <td>100.761032</td>\n </tr>\n <tr>\n <th>2</th>\n <td>KUENENIA STUTTGARTIENSIS</td>\n <td>8.044455</td>\n <td>100.761032</td>\n </tr>\n <tr>\n <th>3</th>\n <td>METAGENOMICS</td>\n <td>8.044455</td>\n <td>100.761032</td>\n </tr>\n <tr>\n <th>4</th>\n <td>ENRICHMENT CULTURE</td>\n <td>-99.356590</td>\n <td>-78.270439</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"# % matplotlib inline\n",
"\n",
"vector_data = pd.DataFrame(kwd_nlp[\"vector\"].to_list(), index=kwd_nlp[\"keyword_all\"]).reset_index()\n",
"vector_data.head()\n",
"\n",
"labels = vector_data.values[:, 0]\n",
"record_vectors = vector_data.values[:, 1:]\n",
"\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 96,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
]
},
{
"data": {
"text/plain": "<matplotlib.legend.Legend at 0x1b1f8532370>"
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAGwCAYAAAD2XSKVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9fXRU9b3v/yIkM3mcDJOnScJkIAwhIRAQQiFIxEjUUrGt0B7UeuSWU6pGxXps69V1df1++lva2uNCUVM89NLitSKnF2iVnlRNCBghUBKBgUiYDCHJkMlk8sBk8jiTkPz+2LM3eYQAQQh8X2uxgMzOzN6z98z3vT8P78+Evr6+PgQCgUAgEAgENyR+13sHBAKBQCAQCAQjI8SaQCAQCAQCwQ2MEGsCgUAgEAgENzBCrAkEAoFAIBDcwAixJhAIBAKBQHADI8SaQCAQCAQCwQ2MEGsCgUAgEAgENzD+13sHxhu9vb3Y7XbCwsKYMGHC9d4dgUAgEAgEo6Cvr4/W1lbi4uLw87v2sarz58/T3d097GMBAQFMnDhx1M8lxNplYrfbMRgM13s3BAKBQCAQXAE2m43Jkydfs+fv6+vD4XDgcrkuup1Wq0Wv148q8CPE2mUSFhYGSCdbo9Fc570RCAQCgUAwGtxuNwaDQVnHrxWyUIuOjiY4OHiIGOvr66OjowOn0wlAbGzsJZ9TiLXLRH7TNRqNEGsCgUAgEIwzrmUJ0/nz5xWhFhERMeJ2QUFBADidTqKjoy+ZEhUNBgKBQCAQCARjgFyjFhwcfMlt5W1GqmvrjxBrAoFAIBAIBGPIaKJ3lxPhE2JNIBAIBAKB4AZGiDWBQCAQCASCGxgh1gQCgUAgEAhuYIRYEwgEAoFAIBhD+vr6xmQbGSHWBAKBQCAQCMaAgIAAADo6Oi65rbyN/DsXQ/isCQQCgUAgEIwBEydORKvVKoa3lzLF1Wq1oxo7JcSaQCAQCAQCwRih1+sBFME2EvK4qdEgxJpAIBAIBALBGDFhwgRiY2OJjo4Wg9wFAoFAIBAIblQmTpx4WYLsYogGA4FAIBAIBIIbGCHWBAKBQCAQCG5ghFgTCATjFrur83rvgkAgEFxzhFgTCATXhasVWnZXJ0t/V3hNBZsQgwKB4EZAiDWBQPCtMxZCK04bxL5fZRGnDRr1a14O34YYFAgEgtEgxJpAIPjWuVyhdbHnGQ1XIrzGah8FAoHgahFiTSAQXBe+TRF0pcJLCDWBQHAjIMSaQCC4JlxN+vBapB6F8BIIBOMVIdYEAsGouBwBdTX1XqJWTCAQCAYixJpAILgklyugrqbea7S/K8ScQCC4VRBiTSAQXJIrEV9Xk3YcjVAT0TeBQHCrIMSaQCAYFTdSzZfo1BQIBLcSQqwJBLcYN0s0Sgg1gUBwqyDEmkBwC3Ejpg9vpH0RCASCGxEh1gSCW4ixSh+OVmBdarsbUTwKBALBjYYQawLBLcZYCLXRCKzRbCdqzwQCgeDSTOjr6+u73jsxnnC73YSHh9PS0oJGo7neuyMQjDl2V+eoujFHI7BGu53g+iHOkeBWYTyv3yKyJhAIFEYbNRvt4i5EwI2NSEMLBOMDEVm7TMazMhcIRoOItNxaiPMtuFUYz+u3iKwJBIIBiIX71kKcb4HgxkeINYFgHDNWXZnfxj4IBAKB4MoQYk0gGKeMZVfmtd4HgUAgEFw5ombtMhnPOW/Bzce17socy85QgUAguJ6M5/VbRNYEgnHMtezKHOvOUIFAIBBcGeNGrP3+978nLS0NjUaDRqMhIyODvLw85fGuri6efPJJIiIiCA0NZdWqVdTX1w94jpqaGu677z6Cg4OJjo7mV7/6FT09Pd/2oQgE4wJhWCsQCAQ3BuNGrE2ePJnf/OY3lJaWUlJSwl133cUPfvADysrKAHj22Wf59NNP+ctf/sK+ffuw2+2sXLlS+f3z589z33334fV6OXDgAFu3buVPf/oTL7/88vU6JIHguiOiZgKBQHDjM65r1nQ6Hb/73e/40Y9+RFRUFB999BE/+tGPACgvLyclJYXi4mIWLVpEXl4eK1aswG63ExMTA8CmTZt4/vnnaWhoQKVSjeo1x3POWzA+uVY1YXKaU0TPBALBrcB4Xr/HTWStP+fPn+fjjz+mvb2djIwMSktL6e7uJjs7W9kmOTmZhIQEiouLASguLmb27NmKUAO49957cbvdSnRuODweD263e8AfgeDb4mq6LUcTNRNCTSAQCG58xpVYO378OKGhoajVah5//HF27drFzJkzcTgcqFQqtFrtgO1jYmJwOBwAOByOAUJNflx+bCRef/11wsPDlT8Gg2FsD0oguAhXKqhEc4BAIBDcPIwrsTZjxgyOHj3KoUOHeOKJJ1izZg3ffPPNNX3NF154gZaWFuWPzWa7pq8nEAzmSgSViJoJBALBzYP/9d6By0GlUmEymQCYP38+hw8f5u2332b16tV4vV5cLteA6Fp9fT16vR4AvV7PP//5zwHPJ3eLytsMh1qtRq1Wj/GRCATXHiHUBAKB4OZgXEXWBtPb24vH42H+/PkEBARQUFCgPHbq1ClqamrIyMgAICMjg+PHj+N0OpVtvvjiCzQaDTNnzvzW910gkBHu/wKBQCC4GOMmsvbCCy+wfPlyEhISaG1t5aOPPmLv3r189tlnhIeH82//9m/8+7//OzqdDo1Gw9NPP01GRgaLFi0C4J577mHmzJn867/+K2+88QYOh4P/9b/+F08++aSInAmuG1fTkSkmBwgEAsGtwbgRa06nk0cffZS6ujrCw8NJS0vjs88+4+677wZgw4YN+Pn5sWrVKjweD/feey+5ubnK70+cOJHdu3fzxBNPkJGRQUhICGvWrOGVV165XockuMm4EvF0tQ0Eoi5NIBAIbn7Gtc/a9WA8+7QIrh3XQzzd6JG1G33/BALBrcV4Xr/Hdc2aQHCjcD26L29kIXQ1/nACgUAgGIgQawLBGHEji6dvG2EdIhAIBGOHEGsCgeCaIISaQCAQjA1CrAkEAoFAIBDcwAixJhAIBKNgQ77leu+CQCC4RRFiTSC4TETR/K3B5r0VABRZnGzIt5BbaBWCTSAQXBfGjc+aQHAjIPzNbg02763gjS8qaO/pw9vdzeavqsnJMpEcHXq9d00gENyCiMiaQMDoo2Wiy/HmJc9sV/69cFoUjyw00NLhYfNX1WSnRAHwWt5Jth2suk57KBAIblWEWBPc8lyuJ5gQajcfeWY72w7XKILtQIUTjcqPDw/ZeGShgfyTDdS3dABwrNaF2ea6jnsrEAhuNYRYE9zyiGiZ4ExDG8WVzZx2tvFuvoV/fFNPbpEUUXO4u0iNDcPu6mJqZAg7vrZTVuu63rssEAhuIUTNmkCAiJbdiOQWWPD0wbPZSdf8tXKWJXG+D/5Z3cyZxnbuTY1BExRAu+c8xZXN5GQayS2q5vFMI4um6Hho0ZRrvk8CgUAgIyJrAoHghiO3wMKGPVYsjpZr3oH5Xr6FPLOdli4vxZXNRISoKKk6B4C7s5vslCglyvbXYw4OVjVTZHFe030SCASC/gixJhAIbjimRoWyal4c+ScbiAi8dgmADfkWPi61sX77UT4rq+eJTCNN7V7K6lo509jO7Mnh1Lm6eDzTyIlaNz9ZYKC4shmAjfkWYeMiEAi+FUQaVCAQ3HAsT4sDYGaMhkeXJF6T19h2sIrcQiuPLDSgC1bzjaOF3/tSnUXWZpaadOQWVbMmI4GG9m4cbg8AW9akU3y6gU/NDs51eFh3h0mk0QUCwTVFiDWB4DKxuzrF4vwtIAu2scZsc5Fm0Cp1Z6nxWiLD1PSWQHhQAJuKqtFr1BypbSUjUUetq5P8kw2szzLx+F0X6uc2f1XNXckx4loQCATXHJEGFQgug8u1+RDcWJhtLj76Z5VSB/fQoilEhqn53Wcn+b+lNnZ8bScjUcfUyBCKK5tJN+rIP9lARqIOZ1uXYtmRmRTNjscXk5kUfR2PRiAQ3CqIyJpAwOijZcLmY3xz6HQDO762kxobxgTgF9lJxGmDuC1eS+a0SDw9vXzjcLP3VAN6jZpQlR+/vns6X5918fH
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"g = sns.scatterplot(tnse_data, x=\"TNSE-X\", y=\"TNSE-Y\", s=1)\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 99,
"outputs": [],
"source": [
"wos_nlp.to_csv(f\"{outdir}/wos_nlp.csv\", index=False, sep='\\t')\n",
"tnse_data.to_csv(f\"{outdir}/kw_nlp.csv\", index=False, sep='\\t')\n",
"\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)\n",
"tnse_data.drop_duplicates(subset=record_col).to_excel(f\"{outdir}/kw_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}