You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_processing_pipeline.ipynb

1171 lines
651 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"def md5hash(s: str):\n",
" return hashlib.md5(s.encode('utf-8')).hexdigest()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"record_col=\"UT (Unique WOS ID)\"\n",
"outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of initial records: 41511\n",
"Number of filtered records: 35663\n"
]
}
],
"source": [
"wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
"print(f'Number of initial records: {len(wos)}')\n",
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
"\n",
"\n",
"metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n",
"metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n",
"metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"\n",
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n",
"wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n",
"\n",
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
"wos = wos_merge.sort_values(by=\"issn_var\",ascending=False).drop_duplicates(subset=record_col)\n",
"\n",
"# drop entries not indexed by metrix\n",
"wos = wos[~wos[\"Domain_English\"].isna()]\n",
"# drop duplicates (based on doi)\n",
"wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n",
"wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
"wos = wos[((wos[\"Publication Year\"]<2023) & (~wos['Domain_English'].isna()))]\n",
"print(f'Number of filtered records: {len(wos)}')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": " Article Title \n38882 Data Collection for Security Measurement in Wi... \\\n25163 A hybrid pricing and cutting approach for the ... \n8801 Enhanced implicit function-based network for a... \n3178 Guaranteed Classification via Regularized Simi... \n40567 A Framework for Learning Analytics Using Commo... \n... ... \n6520 Module dividing for brain functional networks ... \n11606 Unsupervised Learning for Monaural Source Sepa... \n40742 Task-dependent modulation of effective connect... \n8313 Multi-lesion radiomics of PET/CT for non-invas... \n6996 GALAXY: A new hybrid MOEA for the optimal desi... \n\n Keywords Plus \n38882 ATTACK DETECTION; CLONE ATTACKS; ARCHITECTURE \\\n25163 NaN \n8801 NaN \n3178 NaN \n40567 NaN \n... ... \n6520 CONVOLUTIONAL NEURAL-NETWORK; EFFECTIVE CONNEC... \n11606 NONNEGATIVE MATRIX FACTORIZATION; BLIND SOURCE... \n40742 NaN \n8313 INTERNATIONAL ASSOCIATION; STAGE-I; CLASSIFICA... \n6996 EVOLUTIONARY MULTIOBJECTIVE OPTIMIZATION; GENE... \n\n Author Keywords \n38882 Attack detection; Internet of Things (IoT); se... \n25163 Transportation; Full truckload transport; Colu... \n8801 arbitrary-scale image super-resolution; local-... \n3178 NaN \n40567 wearable sensors; learning analytics; pervasiv... \n... ... \n6520 Brain functional network; Betweenness efficien... \n11606 adaptive signal processing; blind source separ... \n40742 default mode network; effective connectivity; ... \n8313 Lung adenocarcinoma; Positron-emission tomogra... \n6996 hybrid algorithm; exploration and exploitation... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Article Title</th>\n <th>Keywords Plus</th>\n <th>Author Keywords</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>38882</th>\n <td>Data Collection for Security Measurement in Wi...</td>\n <td>ATTACK DETECTION; CLONE ATTACKS; ARCHITECTURE</td>\n <td>Attack detection; Internet of Things (IoT); se...</td>\n </tr>\n <tr>\n <th>25163</th>\n <td>A hybrid pricing and cutting approach for the ...</td>\n <td>NaN</td>\n <td>Transportation; Full truckload transport; Colu...</td>\n </tr>\n <tr>\n <th>8801</th>\n <td>Enhanced implicit function-based network for a...</td>\n <td>NaN</td>\n <td>arbitrary-scale image super-resolution; local-...</td>\n </tr>\n <tr>\n <th>3178</th>\n <td>Guaranteed Classification via Regularized Simi...</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>40567</th>\n <td>A Framework for Learning Analytics Using Commo...</td>\n <td>NaN</td>\n <td>wearable sensors; learning analytics; pervasiv...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>6520</th>\n <td>Module dividing for brain functional networks ...</td>\n <td>CONVOLUTIONAL NEURAL-NETWORK; EFFECTIVE CONNEC...</td>\n <td>Brain functional network; Betweenness efficien...</td>\n </tr>\n <tr>\n <th>11606</th>\n <td>Unsupervised Learning for Monaural Source Sepa...</td>\n <td>NONNEGATIVE MATRIX FACTORIZATION; BLIND SOURCE...</td>\n <td>adaptive signal processing; blind source separ...</td>\n </tr>\n <tr>\n <th>40742</th>\n <td>Task-dependent modulation of effective connect...</td>\n <td>NaN</td>\n <td>default mode network; effective connectivity; ...</td>\n </tr>\n <tr>\n <th>8313</th>\n <td>Multi-lesion radiomics of PET/CT for non-invas...</td>\n <td>INTERNATIONAL ASSOCIATION; STAGE-I; CLASSIFICA...</td>\n <td>Lung adenocarcinoma; Positron-emission tomogra...</td>\n </tr>\n <tr>\n <th>6996</th>\n <td>GALAXY: A new hybrid MOEA for the optimal desi...</td>\n <td>EVOLUTIONARY MULTIOBJECTIVE OPTIMIZATION; GENE...</td>\n <td>hybrid algorithm; exploration and exploitation...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600013 COMPARATIVE GENOMICS\n1 WOS:000208863600013 ANAMMOX\n2 WOS:000208863600013 KUENENIA STUTTGARTIENSIS\n3 WOS:000208863600013 METAGENOMICS\n4 WOS:000208863600013 ENRICHMENT CULTURE\n.. ... ...\n97 WOS:000209724300006 VIRTUAL DISKS\n98 WOS:000209724300006 HETEROGENEOUS SERVICES\n99 WOS:000209810700046 CORROSION CHARACTERIZATION\n100 WOS:000209810700046 FEATURE EXTRACTION\n101 WOS:000209810700046 PULSED EDDY CURRENT\n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>ANAMMOX</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>KUENENIA STUTTGARTIENSIS</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>METAGENOMICS</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>ENRICHMENT CULTURE</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209724300006</td>\n <td>VIRTUAL DISKS</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209724300006</td>\n <td>HETEROGENEOUS SERVICES</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209810700046</td>\n <td>CORROSION CHARACTERIZATION</td>\n </tr>\n <tr>\n <th>100</th>\n <td>WOS:000209810700046</td>\n <td>FEATURE EXTRACTION</td>\n </tr>\n <tr>\n <th>101</th>\n <td>WOS:000209810700046</td>\n <td>PULSED EDDY CURRENT</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_df = pd.DataFrame()\n",
"for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
" kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
" kwp.name = 'keyword_all'\n",
" kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n",
"kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n",
"kw_df.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600013 COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n1 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n2 WOS:000208863900217 DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n3 WOS:000208972600008 BRAIN-MACHINE INTERFACE ; FIELD-PROGRAMMABLE G...\n4 WOS:000209043200014 CYANOBACTERIA BLOOM; DRINKING WATER TREATMENT;...",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600266</td>\n <td>ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863900217</td>\n <td>DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208972600008</td>\n <td>BRAIN-MACHINE INTERFACE ; FIELD-PROGRAMMABLE G...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209043200014</td>\n <td>CYANOBACTERIA BLOOM; DRINKING WATER TREATMENT;...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
"wos_kwd_concat.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"geotext = GeoText()\n",
"\n",
"def extract_location(input_text, key='countries'):\n",
" anomalies = {\"Malta\":\"Malta\",\n",
" \"Mongolia\":\"Mongolia\",\n",
" \"Quatar\":\"Qatar\",\n",
" \"Qatar\":\"Qatar\",\n",
" \"Ethiop\":\"Ethiopia\",\n",
" \"Nigeria\":\"Nigeria\",\n",
" \"BELAR\":\"Belarus\",\n",
" \"Venezuela\":\"Venezuela\",\n",
" \"Cyprus\":\"Cyprus\",\n",
" \"Ecuador\":\"Ecuador\",\n",
" \"U Arab\":\"United Arab Emirates\",\n",
" \"Syria\":\"Syria\",\n",
" \"Uganda\":\"Uganda\",\n",
" \"Yemen\":\"Yemen\",\n",
" \"Mali\":\"Mali\",\n",
" \"Senegal\":\"Senegal\",\n",
" \"Vatican\":\"Vatican\",\n",
" \"Uruguay\":\"Uruguay\",\n",
" \"Panama\":\"Panama\",\n",
" \"Fiji\":\"Fiji\",\n",
" \"Faroe\":\"Faroe Islands\",\n",
" \"Macedonia\":\"Macedonia\",\n",
" 'Mozambique':'Mozambique',\n",
" \"Kuwait\":\"Kuwait\",\n",
" \"Libya\":\"Libya\",\n",
" \"Turkiy\":\"Turkey\",\n",
" \"Liberia\":\"Liberia\",\n",
" \"Namibia\":\"Namibia\",\n",
" \"Ivoire\":\"Ivory Coast\",\n",
" \"Guatemala\":\"Gutemala\",\n",
" \"Paraguay\":\"Paraguay\",\n",
" \"Honduras\":\"Honduras\",\n",
" \"Nicaragua\":\"Nicaragua\",\n",
" \"Trinidad\":\"Trinidad & Tobago\",\n",
" \"Liechtenstein\":\"Liechtenstein\",\n",
" \"Greenland\":\"Denmark\"}\n",
"\n",
" extracted = geotext.extract(input_text=input_text)\n",
" found = extracted[key].keys()\n",
" if len(sorted(found))>0:\n",
" return sorted(found)[0]\n",
" elif key=='countries':\n",
" for i in ['Scotland','Wales','England', 'N Ireland']:\n",
" if i in input_text:\n",
" return 'United Kingdom'\n",
" for j in anomalies.keys():\n",
" if j in input_text:\n",
" return anomalies.get(j)\n",
" else:\n",
" return None\n",
"\n",
"with open('../eu_members.txt',\"r\") as f:\n",
" eu_countries=f.readline().split(\",\")\n",
" eu_countries=[i.strip() for i in eu_countries]\n",
"\n",
"def country_cleanup(country):\n",
" if \"USA\" in country:\n",
" return \"USA\"\n",
" elif \"China\" in country:\n",
" return \"China\"\n",
" elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n",
" return \"United Kingdom\"\n",
" else:\n",
" return country\n",
"\n",
"\n",
"def country_type(country):\n",
" if country in eu_countries:\n",
" return \"EU\"\n",
" elif country==\"China\":\n",
" return \"China\"\n",
" elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n",
" return \"Non-EU associate\"\n",
" else:\n",
" return \"Other\"\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])\n",
"locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
"locations[\"Country_split\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n",
"locations[\"Country_split\"]=locations['Country_split'].apply(lambda x: country_cleanup(x))\n",
"locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
"locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"data": {
"text/plain": "Country_split\n Peoples R China; 63061\n England; 13544\n Peoples R China 11696\n Germany; 7727\n Italy; 6942\n ... \n NJ 08854 USA 1\n ID 83707 USA; 1\n AL 36832 USA; 1\n MT 59812 USA 1\n FL 34787 USA; 1\nName: count, Length: 1907, dtype: int64"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"locations[\"Country_split\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
"locations=locations[locations[\"Country_Type\"].isin(scope_types)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208863600013 Radboud Univ Nijmegen, Dept Microbiol, Inst W... \\\n2 WOS:000208863600013 Zhejiang Univ, Dept Environm Engn, Hangzhou 3... \n3 WOS:000208863600013 Radboud Univ Nijmegen, Dept Mol Biol, Nijmege... \n4 WOS:000208863600013 Delft Univ Technol, Dept Biotechnol, Delft, N... \n6 WOS:000208863600266 Univ Bergen, Ctr Geobiol, Dept Biol, N-5020 B... \n\n Country City Country_Type Institution \n1 Netherlands Nijmegen EU Radboud Univ Nijmegen \n2 China Hangzhou China Zhejiang Univ \n3 Netherlands Mol EU Radboud Univ Nijmegen \n4 Netherlands Delft EU Delft Univ Technol \n6 Norway Bergen Non-EU associate Univ Bergen ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>Radboud Univ Nijmegen, Dept Microbiol, Inst W...</td>\n <td>Netherlands</td>\n <td>Nijmegen</td>\n <td>EU</td>\n <td>Radboud Univ Nijmegen</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 3...</td>\n <td>China</td>\n <td>Hangzhou</td>\n <td>China</td>\n <td>Zhejiang Univ</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>Radboud Univ Nijmegen, Dept Mol Biol, Nijmege...</td>\n <td>Netherlands</td>\n <td>Mol</td>\n <td>EU</td>\n <td>Radboud Univ Nijmegen</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Delft Univ Technol, Dept Biotechnol, Delft, N...</td>\n <td>Netherlands</td>\n <td>Delft</td>\n <td>EU</td>\n <td>Delft Univ Technol</td>\n </tr>\n <tr>\n <th>6</th>\n <td>WOS:000208863600266</td>\n <td>Univ Bergen, Ctr Geobiol, Dept Biol, N-5020 B...</td>\n <td>Norway</td>\n <td>Bergen</td>\n <td>Non-EU associate</td>\n <td>Univ Bergen</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
"univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
"univ_locations = univ_locations.drop_duplicates()\n",
"univ_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600013 China China \\\n1 WOS:000208863600013 Netherlands EU \n2 WOS:000208863600013 Netherlands EU \n3 WOS:000208863600013 Netherlands EU \n4 WOS:000208863600013 Netherlands EU \n\n author_str_id \n0 54c7bc6fe9b77434ca1bf04d763d843b \n1 6a775fcd8d11fcb084671b8cae4d6305 \n2 aa6accfdf7626441fe9191636dab4c35 \n3 b707b51d1ca3b5aa76de6ce6df20e6e4 \n4 df81f9da6c8f5c968c16ef0aab1bb8f9 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>6a775fcd8d11fcb084671b8cae4d6305</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>aa6accfdf7626441fe9191636dab4c35</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>b707b51d1ca3b5aa76de6ce6df20e6e4</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
"author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
"author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
"author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n",
"author_locations = author_locations.drop(columns=\"Author_name\")\n",
"author_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600013 China China \\\n4 WOS:000208863600013 Netherlands EU \n6 WOS:000208863600013 Netherlands EU \n7 WOS:000208863600266 China China \n13 WOS:000208863900217 China China \n... ... ... ... \n441911 WOS:000951829800021 China China \n441912 WOS:000951829800021 Netherlands EU \n441913 WOS:000952055000007 China China \n441914 WOS:000952055000007 China China \n441916 WOS:000952055000007 United Kingdom Non-EU associate \n\n author_str_id \n0 54c7bc6fe9b77434ca1bf04d763d843b \n4 df81f9da6c8f5c968c16ef0aab1bb8f9 \n6 df81f9da6c8f5c968c16ef0aab1bb8f9 \n7 5dfb4f0408a2cc8b7f36f5516938b62c \n13 00e44aa0a23a3fc9571b1053a4453a54 \n... ... \n441911 fc15bf7c800877e1c33f4a7397840faa \n441912 6b8763361150d7c3ceecf9eca9efd83b \n441913 80231479c1502ce8649717236023b6c9 \n441914 0af23824e538b0816c19239079d58c77 \n441916 b77dd6bc0ae30a2f96d43eebb1b3d89a \n\n[387172 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n <tr>\n <th>6</th>\n <td>WOS:000208863600013</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n </tr>\n <tr>\n <th>7</th>\n <td>WOS:000208863600266</td>\n <td>China</td>\n <td>China</td>\n <td>5dfb4f0408a2cc8b7f36f5516938b62c</td>\n </tr>\n <tr>\n <th>13</th>\n <td>WOS:000208863900217</td>\n <td>China</td>\n <td>China</td>\n <td>00e44aa0a23a3fc9571b1053a4453a54</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>441911</th>\n <td>WOS:000951829800021</td>\n <td>China</td>\n <td>China</td>\n <td>fc15bf7c800877e1c33f4a7397840faa</td>\n </tr>\n <tr>\n <th>441912</th>\n <td>WOS:000951829800021</td>\n <td>Netherlands</td>\n <td>EU</td>\n <td>6b8763361150d7c3ceecf9eca9efd83b</td>\n </tr>\n <tr>\n <th>441913</th>\n <td>WOS:000952055000007</td>\n <td>China</td>\n <td>China</td>\n <td>80231479c1502ce8649717236023b6c9</td>\n </tr>\n <tr>\n <th>441914</th>\n <td>WOS:000952055000007</td>\n <td>China</td>\n <td>China</td>\n <td>0af23824e538b0816c19239079d58c77</td>\n </tr>\n <tr>\n <th>441916</th>\n <td>WOS:000952055000007</td>\n <td>United Kingdom</td>\n <td>Non-EU associate</td>\n <td>b77dd6bc0ae30a2f96d43eebb1b3d89a</td>\n </tr>\n </tbody>\n</table>\n<p>387172 rows × 4 columns</p>\n</div>"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations[author_locations['author_str_id'].duplicated(False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
"# author_primary_region\n",
"\n",
"china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
"eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
"assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n",
"\n",
"\n",
"# records that have distinct authors with different country affiliations\n",
"valid_scope = wos[((wos[record_col].isin(china))\n",
" &\n",
" ((wos[record_col].isin(eu))\n",
" |\n",
" (wos[record_col].isin(assoc))))][record_col].unique()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600013 China China \\\n304939 WOS:000648878200015 China China \n304935 WOS:000648805900001 China China \n304934 WOS:000648805900001 China China \n304933 WOS:000648805900001 China China \n\n author_str_id \n0 54c7bc6fe9b77434ca1bf04d763d843b \n304939 043a846fd3ea05c308e9944b984b8d8f \n304935 4132592fad8ecaa0bc99a8148c348f45 \n304934 0bcfdc30b9929c5513eaabfe484ffd26 \n304933 3d5c738679e81c68cc67a06ecc686851 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>China</td>\n <td>China</td>\n <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n </tr>\n <tr>\n <th>304939</th>\n <td>WOS:000648878200015</td>\n <td>China</td>\n <td>China</td>\n <td>043a846fd3ea05c308e9944b984b8d8f</td>\n </tr>\n <tr>\n <th>304935</th>\n <td>WOS:000648805900001</td>\n <td>China</td>\n <td>China</td>\n <td>4132592fad8ecaa0bc99a8148c348f45</td>\n </tr>\n <tr>\n <th>304934</th>\n <td>WOS:000648805900001</td>\n <td>China</td>\n <td>China</td>\n <td>0bcfdc30b9929c5513eaabfe484ffd26</td>\n </tr>\n <tr>\n <th>304933</th>\n <td>WOS:000648805900001</td>\n <td>China</td>\n <td>China</td>\n <td>3d5c738679e81c68cc67a06ecc686851</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_primary_region.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of records: 35663\n",
"Number of valid cooperation records: 31861\n"
]
}
],
"source": [
"print(f'Number of records: {len(wos)}')\n",
"print(f'Number of valid cooperation records: {len(valid_scope)}')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [],
"source": [
"wos = wos[wos[record_col].isin(valid_scope)]\n",
"locations = locations[locations[record_col].isin(valid_scope)]\n",
"univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
"author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
"author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
"affiliations = affiliations.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 3623\nUNIVERSITY OF LONDON 1729\nUDICE-FRENCH RESEARCH UNIVERSITIES 1421\nTSINGHUA UNIVERSITY 1347\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 1330\n ... \nFRESHWATER FISHERIES RESEARCH CENTER, CAFS 1\nHEILONGJIANG RIVER FISHERIES RESEARCH INSTITUTE, CAFS 1\nINSTITUTE OF METEOROLOGY & WATER MANAGEMENT 1\nFEDERAL MINISTRY OF HEALTH - ETHIOPIA (FMOH) 1\nTANGSHAN UNIVERSITY 1\nName: count, Length: 6784, dtype: int64"
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"data": {
"text/plain": "Institution\n Chinese Acad Sci 3618\n Tsinghua Univ 1633\n Shanghai Jiao Tong Univ 1372\n Zhejiang Univ 1288\n Univ Elect Sci & Technol China 969\n ... \n Ludwig Boltzmann Inst Clin Forens Imaging 1\n Royal Brampton Hosp 1\n Inst Spacecraft Syst Engn CAST 1\n Sevalo Construct Machinery Remfg Co Ltd 1\n Int Digital Econ Acad 1\nName: count, Length: 14546, dtype: int64"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": "31861"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": "31861"
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"data": {
"text/plain": "138559"
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": "181832"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "WoS Categories\n Engineering, Electrical & Electronic 6066\nComputer Science, Artificial Intelligence 4859\nComputer Science, Information Systems 3740\n Telecommunications 3304\nEngineering, Electrical & Electronic 2451\n ... \n Criminology & Penology 1\nArea Studies 1\nMaterials Science, Paper & Wood 1\n Emergency Medicine 1\n Geology 1\nName: count, Length: 415, dtype: int64"
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Research Areas\nEngineering 12815\nComputer Science 12386\nTelecommunications 3577\nImaging Science & Photographic Technology 1949\nEnvironmental Sciences & Ecology 1887\n ... \nMusic 1\nAsian Studies 1\nCultural Studies 1\nArea Studies 1\nEmergency Medicine 1\nName: count, Length: 145, dtype: int64"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[c for c in wos.columns if \"_English\" in c]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
"for m in metrix_levels:\n",
" wos[m] = wos[m].replace({\"article-level classification\":\"Miscellaneous\"})\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wos"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"metrix_levels"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"outdir=\"wos_processed_data\""
]
},
{
"cell_type": "code",
"execution_count": 80,
"outputs": [],
"source": [
"record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n",
"record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n",
"record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n",
"country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 81,
"outputs": [],
"source": [
"country_collabs = record_countries.merge(record_countries, on=record_col)\n",
"country_collabs = country_collabs[country_collabs[\"Country_x\"]!=country_collabs[\"Country_y\"]]\n",
"country_collabs[\"weight\"] = 0.5"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 82,
"outputs": [],
"source": [
"inst_collabs = record_institution.merge(record_institution, on=record_col)\n",
"inst_collabs = inst_collabs[inst_collabs[\"Institution_x\"]!=inst_collabs[\"Institution_y\"]]\n",
"inst_collabs[\"weight\"] = 0.5"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 85,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 91,
"outputs": [
{
"data": {
"text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']"
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drop_cols = [ws for ws in wos.columns if ((\"uthor\" in ws or \"ddress\" in ws or \"ORCID\" in\n",
" ws or \"esearcher\" in ws or \"ditor\" in ws or \"name\" in ws or 'SEQ' in ws) and \"eyword\" not in ws)]\n",
"drop_cols"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 88,
"outputs": [],
"source": [
"os.makedirs(outdir, exist_ok=True)\n",
"\n",
"wos.drop(columns=drop_cols).to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n",
"\n",
"record_countries.to_excel(f\"{outdir}/wos_countries.xlsx\", index=False)\n",
"\n",
"record_author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n",
"\n",
"record_institution.to_excel(f\"{outdir}/wos_institution_locations.xlsx\", index=False)\n",
"\n",
"kw_df.to_excel(f\"{outdir}/wos_keywords.xlsx\", index=False)\n",
"\n",
"country_types.to_excel(f\"{outdir}/wos_country_types.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 92,
"outputs": [],
"source": [
"wos.drop(columns=drop_cols).to_csv(f\"{outdir}/wos_processed.csv\", index=False, sep='\\t')\n",
"\n",
"record_countries.to_csv(f\"{outdir}/wos_countries.csv\", index=False, sep='\\t')\n",
"\n",
"record_author_locations.to_csv(f\"{outdir}/wos_author_locations.csv\", index=False, sep='\\t')\n",
"\n",
"record_institution.to_csv(f\"{outdir}/wos_institution_locations.csv\", index=False, sep='\\t')\n",
"\n",
"kw_df.to_csv(f\"{outdir}/wos_keywords.csv\", index=False, sep='\\t')\n",
"\n",
"country_types.to_csv(f\"{outdir}/wos_country_types.csv\", index=False, sep='\\t')\n",
"\n",
"inst_collabs.to_csv(f\"{outdir}/wos_inst_collabs.csv\", index=False, sep='\\t')\n",
"\n",
"country_collabs.to_csv(f\"{outdir}/wos_country_collabs.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Basic network layout"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Simple NLP part"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGdCAYAAAAPLEfqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAs8klEQVR4nO3de3RU5b3/8U8gJAQkE25JyI8AqSCXgnK4NEaRlpJDkGjl0h6psaCkUDRREBGDl1TFNhAVAbWkHhFwCYqcA0jhgMSgcKoRJBq5VANoMFAygRqS4SIhJPv3Byd7MSTAQxiYyfB+rbXXYvZ+5pnvN5u95rP27NkTYFmWJQAAAFxQI28XAAAA0BAQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwQmgAAAAwEersAf1FdXa2DBw+qRYsWCggI8HY5AADAgGVZOnr0qKKiotSo0YXPJRGaPOTgwYOKjo72dhkAAKAe9u/fr/bt219wDKHJQ1q0aCHpzB89NDTUy9UAAAATLpdL0dHR9vv4hRCaPKTmI7nQ0FBCEwAADYzJpTVcCA4AAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGAg0NsFALj2dEpb6+0SLtm+mYneLgGAl3GmCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwIBXQ9PmzZt15513KioqSgEBAVq1apXbdsuylJ6ernbt2ikkJETx8fHas2eP25jS0lIlJSUpNDRUYWFhSk5O1rFjx9zGbN++XbfddpuaNm2q6OhoZWZm1qpl+fLl6tatm5o2bapevXrpf/7nfzzeLwAAaLi8GpqOHz+um266Sa+99lqd2zMzMzVv3jxlZWVpy5Ytat68uRISEnTy5El7TFJSknbt2qXs7GytWbNGmzdv1oQJE+ztLpdLQ4YMUceOHZWXl6cXXnhBzzzzjF5//XV7zKeffqrf/va3Sk5O1pdffqnhw4dr+PDh2rlz55VrHgAANCgBlmVZ3i5CkgICArRy5UoNHz5c0pmzTFFRUXr00Uc1depUSVJ5ebkiIiK0aNEijR49Wl9//bV69Oihzz//XP369ZMkrV+/XsOGDdOBAwcUFRWl+fPn68knn5TT6VRQUJAkKS0tTatWrdI333wjSbr77rt1/PhxrVmzxq7n5ptvVu/evZWVlWVUv8vlksPhUHl5uUJDQz31ZwH8Uqe0td4u4ZLtm5no7RIAXAGX8v7ts9c0FRYWyul0Kj4+3l7ncDgUGxur3NxcSVJubq7CwsLswCRJ8fHxatSokbZs2WKPGThwoB2YJCkhIUEFBQU6cuSIPebs16kZU/M6damoqJDL5XJbAACA//LZ0OR0OiVJERERbusjIiLsbU6nU+Hh4W7bAwMD1apVK7cxdc1x9mucb0zN9rpkZGTI4XDYS3R09KW2CAAAGhCfDU2+bvr06SovL7eX/fv3e7skAABwBflsaIqMjJQklZSUuK0vKSmxt0VGRurQoUNu20+fPq3S0lK3MXXNcfZrnG9Mzfa6BAcHKzQ01G0BAAD+y2dDU0xMjCIjI5WTk2Ovc7lc2rJli+Li4iRJcXFxKisrU15enj1m48aNqq6uVmxsrD1m8+bNqqystMdkZ2era9euatmypT3m7NepGVPzOgAAAF4NTceOHVN+fr7y8/Mlnbn4Oz8/X0VFRQoICNDkyZP1/PPPa/Xq1dqxY4fGjBmjqKgo+xt23bt319ChQzV+/Hht3bpVn3zyiVJTUzV69GhFRUVJku655x4FBQUpOTlZu3bt0rJlyzR37lxNmTLFrmPSpElav369XnrpJX3zzTd65plntG3bNqWmpl7tPwkAAPBRgd588W3btmnQoEH245ogM3bsWC1atEjTpk3T8ePHNWHCBJWVlWnAgAFav369mjZtaj9nyZIlSk1N1eDBg9WoUSONGjVK8+bNs7c7HA5t2LBBKSkp6tu3r9q0aaP09HS3ezndcsstWrp0qZ566ik98cQT6tKli1atWqWePXtehb8CAABoCHzmPk0NHfdpAsxxnyYAvsIv7tMEAADgSwhNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABghNAAAABgK9XQAANASd0tZ6u4RLtm9mordLAPwKZ5oAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAMEJoAAAAM+HRoqqqq0tNPP62YmBiFhITo+uuv14wZM2RZlj3Gsiylp6erXbt2CgkJUXx8vPbs2eM2T2lpqZKSkhQaGqqwsDAlJyfr2LFjbmO2b9+u2267TU2bNlV0dLQyMzOvSo8AAKBh8OnQNGvWLM2fP1+vvvqqvv76a82aNUuZmZl65ZVX7DGZmZmaN2+esrKytGXLFjVv3lwJCQk6efKkPSYpKUm7du1Sdna21qxZo82bN2vChAn2dpfLpSFDhqhjx47Ky8vTCy+8oGeeeUavv/76Ve0XAAD4rgDr7NM2PuaOO+5QRESEFixYYK8bNWqUQkJC9Pbbb8uyLEVFRenRRx/V1KlTJUnl5eWKiIjQokWLNHr0aH399dfq0aOHPv/8c/Xr10+StH79eg0bNkwHDhxQVFSU5s+fryeffFJOp1NBQUGSpLS0NK1atUrffPONUa0ul0sOh0Pl5eUKDQ318F8C8C+d0tZ6u4Rrwr6Zid4uAfB5l/L+7dNnmm655Rbl5ORo9+7dkqSvvvpKf//733X77bdLkgoLC+V0OhUfH28/x+FwKDY2Vrm5uZKk3NxchYWF2YFJkuLj49WoUSNt2bLFHjNw4EA7MElSQkKCCgoKdOTIkTprq6iokMvlclsAAID/CvR2AReSlpYml8ulbt26qXHjxqqqqtKf/vQnJSUlSZKcTqckKSIiwu15ERER9jan06nw8HC37YGBgWrVqpXbmJiYmFpz1Gxr2bJlrdoyMjL07LPPeqBLAADQEPj0mab33ntPS5Ys0dKlS/XFF19o8eLFevHFF7V48WJvl6bp06ervLzcXvbv3+/tkgAAwBXk02eaHnvsMaWlpWn06NGSpF69eun7779XRkaGxo4dq8jISElSSUmJ2rVrZz+vpKREvXv3liRFRkbq0KFDbvOePn1apaWl9vMjIyNVUlLiNqbmcc2YcwUHBys4OPjymwQAAA2CT59pOnHihBo1ci+xcePGqq6uliTFxMQoMjJSOTk59naXy6UtW7YoLi5OkhQXF6eysjLl5eXZYzZu3Kjq6mrFxsbaYzZv3qzKykp7THZ2trp27VrnR3MAAODa49Oh6c4779Sf/vQnrV27Vvv27dPKlSs1e/ZsjRgxQpIUEBCgyZMn6/nnn9fq1au1Y8cOjRkzRlFRURo+fLgkqXv37ho6dKjGjx+vrVu36pNPPlFqaqpGjx6tqKgoSdI999yjoKAgJScna9euXVq2bJnmzp2rKVOmeKt1AADgY3z647lXXnlFTz/9tB588EEdOnRIUVFR+sMf/qD09HR7zLRp03T8+HFNmDBBZWV
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load(\"en_core_web_lg\")\n",
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
"wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ')\n",
"# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n",
"\n",
"vectors = list()\n",
"vector_norms = list()\n",
"\n",
"for doc in nlp.pipe(wos_nlp['Document'].astype('unicode').values, batch_size=300,\n",
" n_process=4):\n",
" vectors.append(doc.vector)\n",
" vector_norms.append(doc.vector_norm)\n",
"\n",
"wos_nlp['vector'] = vectors\n",
"wos_nlp['vector_norm'] = vector_norms\n",
"wos_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) TNSE-X TNSE-Y\n0 WOS:000641589600020 131.783783 -4.202979\n1 WOS:000590197400003 74.897812 89.280334\n2 WOS:000510863400004 84.939049 23.416033\n3 WOS:000403039400031 -39.527546 54.230900\n4 WOS:000439363600016 -59.109379 72.877693",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>TNSE-X</th>\n <th>TNSE-Y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000641589600020</td>\n <td>131.783783</td>\n <td>-4.202979</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000590197400003</td>\n <td>74.897812</td>\n <td>89.280334</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000510863400004</td>\n <td>84.939049</td>\n <td>23.416033</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000403039400031</td>\n <td>-39.527546</td>\n <td>54.230900</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000439363600016</td>\n <td>-59.109379</td>\n <td>72.877693</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"# % matplotlib inline\n",
"\n",
"vector_data = pd.DataFrame(wos_nlp[\"vector\"].to_list(), index=wos_nlp[record_col]).reset_index()\n",
"vector_data.head()\n",
"\n",
"labels = vector_data.values[:, 0]\n",
"record_vectors = vector_data.values[:, 1:]\n",
"\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAGwCAYAAABM9z+ZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gUxRvA8e/lUi6X3nslFVIIvYcqHQQERBARQVEBFVHEhqCCDUHFDoJIBwHpvXcSSIGQ3nvvyeXK/v44OIgJGFRsv/08Tx643dmd2c3l7t6bmXckgiAIiEQikUgkEolEIpHovuj93Q0QiUQikUgkEolEon8jMZgSiUQikUgkEolEot9BDKZEIpFIJBKJRCKR6HcQgymRSCQSiUQikUgk+h3EYEokEolEIpFIJBKJfgcxmBKJRCKRSCQSiUSi30EMpkQikUgkEolEIpHod9D/uxvwb6PRaMjNzcXMzAyJRPJ3N0ckEolEIlELCIJAVVUVzs7O6Ok9+O+S1Wo1SqXygdcjEon+XAYGBkil0haXF4Op+5Sbm4ubm9vf3QyRSCQSiUS/Q1ZWFq6urg/s/IIgkJ+fT3l5+QOrQyQSPViWlpY4Ojq2qONEDKbuk5mZGaB9MTY3N/+bWyMSiUQikaglKisrcXNz072PPyi3Ail7e3vkcrk4ikUk+hcRBIHa2loKCwsBcHJy+s1jxGDqPt16UTQ3NxeDKZFIJBKJ/mUeZHCjVqt1gZSNjc0Dq0ckEj04xsbGABQWFmJvb/+bQ/7EBBQikUgkEolEf4Jbc6Tkcvnf3BKRSPRH3Pobbsm8RzGYEolEIpFIJPoTiUP7RKJ/t/v5GxaDKZFIJBKJRCKRSCT6HcRgSiQSiUQikUgkEol+BzGYEolEIpFIJBL9Zd555x3atm2rezxlyhQefvjhP3TOEydOIJFIHmhK+j+jnaL/HjGYEolEIpFIJPqHUWsEzqeU8EtUDudTSlBrhL+k3vPnzyOVShk6dOhfUh/AZ599xpo1ax54PdHR0YwYMQJ7e3tkMhmenp6MHz9elwb7t/xV7RT9u4ip0UUikUgkEon+QQ5cy2Ph7jjyKup125wsZCwY3ppBQb+97s0fsWrVKmbNmsWqVavIzc3F2dn5gdYHYGFh8cDrKCoqol+/fgwbNoyDBw9iaWlJeno6u3btoqampkXn+CvaKfr3EXumRCKRSCQSif4hDlzL49l1VxoFUgD5FfU8u+4KB67lPbC6q6ur2bx5M88++yxDhw5t0gtzayjd3r17CQkJQSaT0aVLF65du6Yrs2bNGiwtLdm5cye+vr7IZDIGDhxIVlbWXev99fA5jUbDkiVL8PLywtjYmNDQULZt29bomH379uHn54exsTF9+vQhPT39ntd29uxZKioqWLlyJWFhYXh5edGnTx+WLVuGl5eXrtz169cZNmwY5ubmmJmZ0bNnT1JSUn5XO2/dr6NHj9KhQwfkcjndunUjISGhUdt2795Nx44dkclk2NraMmrUKN0+hULB3LlzcXFxwcTEhM6dO3PixAnd/oyMDIYPH46VlRUmJia0adOGffv23fNeiP5cYjAlEolEIpFI9A+g1ggs3B1HcwP6bm1buDvugQ3527JlCwEBAfj7+zNp0iR++OEHBKFpXa+88gpLly7l8uXL2NnZMXz48Ebr8dTW1vL++++zdu1azp49S3l5OY8++miL27FkyRLWrl3LN998w/Xr13nppZeYNGkSJ0+eBCArK4vRo0czfPhwoqKimDZtGq+99to9z+no6IhKpWLHjh3NXhNATk4OvXr1wsjIiGPHjhEZGcnUqVNRqVS/q523vPHGGyxdupSIiAj09fWZOnWqbt/evXsZNWoUQ4YM4erVqxw9epROnTrp9s+cOZPz58+zadMmYmJiGDt2LIMGDSIpKQmA559/HoVCwalTp4iNjeXDDz/E1NT0t2+y6E8jDvMTiUQikUgk+ge4lFbapEfqTgKQV1HPpbRSuray+dPrX7VqFZMmTQJg0KBBVFRUcPLkSXr37t2o3IIFCxgwYAAAP/74I66uruzYsYNx48YB2oVOV6xYQefOnXVlAgMDuXTpUqNAoTkKhYLFixdz5MgRunbtCoC3tzdnzpzh22+/JTw8nK+//ppWrVqxdOlSAPz9/XWBxN106dKF119/nccee4wZM2bQqVMn+vbty+TJk3FwcADgyy+/xMLCgk2bNmFgYACAn5/f727nLe+//77u8WuvvcbQoUOpr69HJpPx/vvv8+ijj7Jw4UJd+dDQUAAyMzNZvXo1mZmZuuGWc+fO5cCBA6xevZrFixeTmZnJmDFjCA4O1rVB9NcSe6ZEIpFIJBKJ/gEKq+4eSP2ecvcjISGBS5cuMWHCBAD09fUZP348q1atalL2VvAAYG1tjb+/Pzdu3NBt09fXp2PHjrrHAQEBWFpaNipzN8nJydTW1jJgwABMTU11P2vXrtUNt7tx44YuUGuuTXfz/vvvk5+fzzfffEObNm345ptvCAgIIDY2FoCoqCh69uypC6T+aDtvCQkJ0f3fyUk75+1W0ouoqCj69evXbB2xsbGo1Wr8/Pwa1XHy5EldHbNnz+a9996je/fuLFiwgJiYmN9su+jPJfZMiUQikUgkEv0D2JvJ/tRy92PVqlWoVKpGCScEQcDIyIgVK1b8ZckXqqurAe3wNxcXl0b7jIyM/vD5bWxsGDt2LGPHjmXx4sWEhYXxySef8OOPP2JsbPxA2nlncCaRSADtfCvgnnVWV1cjlUqJjIxEKpU22ndrKN+0adMYOHAge/fu5dChQyxZsoSlS5cya9asFl+L6I8Re6ZEIpFIJBKJ/gE6eVnjZCFDcpf9ErRZ/Tp5Wf+p9apUKtauXcvSpUuJiorS/URHR+Ps7MzGjRsblb9w4YLu/2VlZSQmJhIYGNjofBEREbrHCQkJlJeXNypzN61bt8bIyIjMzEx8fHwa/bi5uQHohgzerU0tZWhoSKtWrXTZ/EJCQjh9+nSj+V9/pJ0tERISwtGjR5vdFxYWhlqtprCwsEkdjo6OunJubm7MmDGD7du38/LLL/P999+3uH7RHyf2TIlEov+cEwmFfHcyhZ+e6oxUKn5nJBKJ/h2kehIWDG/Ns+uuIIFGiShuBVgLhrdGqne3cOv32bNnD2VlZTz11FNNeqDGjBnDqlWrmDFjhm7bokWLsLGxwcHBgTfeeANbW9tGWe4MDAyYNWsWn3/+Ofr6+sycOZMuXbr85nwpADMzM+bOnctLL72ERqOhR48eVFRUcPbsWczNzXniiSeYMWMGS5cu5ZVXXmHatGlERkb+5vpPe/bsYdOmTTz66KP4+fkhCAK7d+9m3759rF69GtAme/jiiy949NFHmT9/PhYWFly4cIFOnTrh7+9/3+1siQULFtCvXz9atWrFo48+ikqlYt++fcybNw8/Pz8mTpzI5MmTWbp0KWFhYRQVFXH06FFCQkIYOnQoL774IoMHD8bPz4+ysjKOHz/eoqBV9OcRP2WIRKL/HEu5AQ4WMjGQEolE/zqDgpz4elI7HC0aD+VztJDx9aR2D2SdqVWrVtG/f/9mh/KNGTOGiIiIRnNxPvjgA1544QXat29Pfn4+u3fvxtDQULdfLpczb948HnvsMbp3746pqSmbN29ucXveffdd3nrrLZYsWUJgYCCDBg1i7969uhTm7u7u/Pzzz+zcuZPQ0FC++eYbFi9efM9ztm7dGrlczssvv0zbtm3p0qULW7ZsYeXKlTz++OOAdgjgsWPHqK6uJjw8nPbt2/P999/fdQ7Vb7WzJXr37s3WrVvZtWsXbdu2pW/fvo163VavXs3kyZN5+eWX8ff35+GHH+by5cu4u7sDoFaref7553X1+/n58dVXX7W4ftEfJxHulh9S1KzKykosLCyoqKjA3Nz8726OSCQSiUSiFvgr3r/r6+tJS0vDy8sLmeyPzWtSawQupZVSWFWPvZl2aN+f3SN1v06cOEGfPn0oKyvD0tKy2TJr1qzhxRdfpLy8/C9tm0j0Z7qfv2VxmJ9IJGqkvLaBA9fyebST+z3L7YrOwc1KzrcnU5j
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"wos_plot = wos_nlp.merge(tnse_data, on=record_col)\n",
"\n",
"g = sns.scatterplot(wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'], x=\"TNSE-X\", y=\"TNSE-Y\",\n",
" hue='Domain_English', s=1)\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
"wos_plot.head()\n",
"wos_nlp = wos_plot[[record_col, \"Document\", \"keyword_all\", \"TNSE-X\", \"TNSE-Y\"]]\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [],
"source": [
"\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 93,
"outputs": [],
"source": [
"wos_nlp.to_csv(f\"{outdir}/wos_nlp.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": "Index(['UT (Unique WOS ID)', 'Document', 'keyword_all', 'TNSE-X', 'TNSE-Y'], dtype='object')"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_nlp.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 94,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlQAAAGdCAYAAADUl+3IAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA1aklEQVR4nO3deXRU9f3/8Ve2ScIyCVsSUgKkBYUIggQIU9B++ZISNLYi2AIipIjygwaERNmqgrV+BfGAQFlSa2voqZTlHKFKSjANm0pkCSCLEqmiQcMkWEgGoiQhc39/9Jv7ZQgq5I5MlufjnHuOcz/v3Hm/Rzt59ebOHT/DMAwBAACgzvx93QAAAEBDR6ACAACwiEAFAABgEYEKAADAIgIVAACARQQqAAAAiwhUAAAAFhGoAAAALAr0dQONhdvtVlFRkVq2bCk/Pz9ftwMAAK6DYRi6cOGCoqOj5e9f9/NMBCovKSoqUkxMjK/bAAAAdXD69Gl16NChzj9PoPKSli1bSvrPvxC73e7jbgAAwPVwuVyKiYkxf4/XFYHKS2r+zGe32wlUAAA0MFYv1+GidAAAAIsIVAAAABYRqAAAACwiUAEAAFhEoAIAALCIQAUAAGARgQoAAMAiAhUAAIBFBCoAAACLCFQAAAAWEagAAAAsIlABAABYRKACAACwiEAFAABgUaCvG8D16Twny9ct3LBPFyb7ugUAAG4KzlABAABYRKACAACwiEAFAABgEYEKAADAIgIVAACART4PVF988YUeeughtWnTRqGhoerZs6cOHDhgrhuGoXnz5ql9+/YKDQ1VYmKiTp486XGMc+fOaezYsbLb7QoPD9fEiRN18eJFj5ojR47ozjvvVEhIiGJiYrRo0aJavWzcuFHdunVTSEiIevbsqX/84x/fz9AAAKBR8WmgOn/+vAYOHKigoCBt3bpVH3zwgRYvXqxWrVqZNYsWLdLy5cuVkZGhvXv3qnnz5kpKStKlS5fMmrFjx+r48ePKycnRli1btHv3bk2aNMlcd7lcGjp0qDp16qT8/Hy9+OKLeuaZZ/Tyyy+bNXv27NGYMWM0ceJEHTp0SMOHD9fw4cN17Nixm/NiAACABsvPMAzDV08+Z84cvfvuu3r77bevuW4YhqKjo/X444/riSeekCSVlZUpMjJSmZmZGj16tD788EPFxcVp//796tu3ryQpOztb99xzjz7//HNFR0dr9erVevLJJ+V0OmWz2czn3rx5s06cOCFJGjVqlMrLy7Vlyxbz+QcMGKDevXsrIyPjO2dxuVwKCwtTWVmZ7Ha7pdflWrgPFQAA3uet398+PUP1xhtvqG/fvvrFL36hiIgI3XHHHfrjH/9orp86dUpOp1OJiYnmvrCwMCUkJCgvL0+SlJeXp/DwcDNMSVJiYqL8/f21d+9es+auu+4yw5QkJSUlqaCgQOfPnzdrrnyempqa57laRUWFXC6XxwYAAJomnwaqTz75RKtXr1bXrl21bds2TZkyRY899pjWrFkjSXI6nZKkyMhIj5+LjIw015xOpyIiIjzWAwMD1bp1a4+aax3jyuf4ppqa9astWLBAYWFh5hYTE3PD8wMAgMbBp4HK7XarT58+ev7553XHHXdo0qRJevTRR6/rT2y+NnfuXJWVlZnb6dOnfd0SAADwEZ8Gqvbt2ysuLs5jX/fu3VVYWChJioqKkiQVFxd71BQXF5trUVFRKikp8Vi/fPmyzp0751FzrWNc+RzfVFOzfrXg4GDZ7XaPDQAANE0+DVQDBw5UQUGBx76PPvpInTp1kiTFxsYqKipKubm55rrL5dLevXvlcDgkSQ6HQ6WlpcrPzzdrtm/fLrfbrYSEBLNm9+7dqqqqMmtycnJ06623mp8odDgcHs9TU1PzPAAAAN/Ep4EqLS1N7733np5//nn961//0tq1a/Xyyy8rNTVVkuTn56cZM2boueee0xtvvKGjR49q/Pjxio6O1vDhwyX954zWsGHD9Oijj2rfvn169913NXXqVI0ePVrR0dGSpAcffFA2m00TJ07U8ePHtX79ei1btkzp6elmL9OnT1d2drYWL16sEydO6JlnntGBAwc0derUm/66AACAhiXQl0/er18/bdq0SXPnztWzzz6r2NhYLV26VGPHjjVrZs2apfLyck2aNEmlpaUaNGiQsrOzFRISYta89tprmjp1qoYMGSJ/f3+NHDlSy5cvN9fDwsL01ltvKTU1VfHx8Wrbtq3mzZvnca+qH//4x1q7dq2eeuop/eY3v1HXrl21efNm9ejR4+a8GAAAoMHy6X2oGhPuQ1Ub96ECANR3jeI+VAAAAI0BgQoAAMAiAhUAAIBFBCoAAACLCFQAAAAWEagAAAAsIlABAABYRKACAACwiEAFAABgEYEKAADAIgIVAACARQQqAAAAiwhUAAAAFhGoAAAALCJQAQAAWESgAgAAsIhABQAAYBGBCgAAwCICFQAAgEUEKgAAAIsIVAAAABYRqAAAACwiUAEAAFhEoAIAALCIQAUAAGARgQoAAMAiAhUAAIBFBCoAAACLCFQAAAAWEagAAAAsIlABAABYRKACAACwiEAFAABgEYEKAADAIgIVAACARQQqAAAAiwhUAAAAFhGoAAAALCJQAQAAWESgAgAAsIhABQAAYBGBCgAAwCICFQAAgEUEKgAAAIsIVAAAABYRqAAAACzyaaB65pln5Ofn57F169bNXL906ZJSU1PVpk0btWjRQiNHjlRxcbHHMQoLC5WcnKxmzZopIiJCM2fO1OXLlz1qdu7cqT59+ig4OFhdunRRZmZmrV5Wrlypzp07KyQkRAkJCdq3b9/3MjMAAGh8fH6G6rbbbtOZM2fM7Z133jHX0tLS9Oabb2rjxo3atWuXioqKNGLECHO9urpaycnJqqys1J49e7RmzRplZmZq3rx5Zs2pU6eUnJyswYMH6/Dhw5oxY4YeeeQRbdu2zaxZv3690tPTNX/+fB08eFC9evVSUlKSSkpKbs6LAAAAGjQ/wzAMXz35M888o82bN+vw4cO11srKytSuXTutXbtWDzzwgCTpxIkT6t69u/Ly8jRgwABt3bpV9957r4qKihQZGSlJysjI0OzZs3X27FnZbDbNnj1bWVlZOnbsmHns0aNHq7S0VNnZ2ZKkhIQE9evXTytWrJAkud1uxcTEaNq0aZozZ851zeJyuRQWFqaysjLZ7XYrL8s1dZ6T5fVjft8+XZjs6xYAAPhW3vr97fMzVCdPnlR0dLR++MMfauzYsSosLJQk5efnq6qqSomJiWZtt27d1LFjR+Xl5UmS8vLy1LNnTzNMSVJSUpJcLpeOHz9u1lx5jJqammNUVlYqPz/fo8bf31+JiYlmzbVUVFTI5XJ5bAAAoGnyaaBKSEhQZmamsrOztXr1ap06dUp33nmnLly4IKfTKZvNpvDwcI+fiYyMlNPplCQ5nU6PMFWzXrP2bTUul0tff/21vvzyS1VXV1+zpuYY17JgwQKFhYWZW0xMTJ1eAwAA0PAF+vLJ7777bvOfb7/9diUkJKhTp07asGGDQkNDfdjZd5s7d67S09PNxy6Xi1AFAEAT5fM/+V0pPDxct9xyi/71r38pKipKlZWVKi0t9agpLi5WVFSUJCkqKqrWp/5qHn9Xjd1uV2hoqNq2bauAgIBr1tQc41qCg4Nlt9s9NgAA0DTVq0B18eJFffzxx2rfvr3i4+MVFBSk3Nxcc72goECFhYVyOBySJIfDoaNHj3p8Gi8nJ0d2u11xcXFmzZXHqKmpOYbNZlN8fLxHjdvtVm5urlkDAADwbXwaqJ544gnt2rVLn376qfbs2aP7779fAQEBGjNmjMLCwjRx4kSlp6drx44dys/P14QJE+RwODRgwABJ0tChQxUXF6dx48bp/fff17Zt2/TUU08pNTVVwcHBkqTJkyfrk08+0axZs3TixAmtWrVKGzZsUFpamtlHenq6/vjHP2rNmjX68MMPNWXKFJWXl2vChAk+eV0AAEDD4tNrqD7//HONGTNG//73v9WuXTsNGjRI7733ntq1aydJeumll+Tv76+RI0eqoqJCSUlJWrVqlfn
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load(\"en_core_web_lg\")\n",
"kwd_nlp = pd.DataFrame(kw_df[\"keyword_all\"].drop_duplicates())\n",
"# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n",
"\n",
"vectors = list()\n",
"vector_norms = list()\n",
"\n",
"for doc in nlp.pipe(kwd_nlp['keyword_all'].astype('unicode').values, batch_size=300,\n",
" n_process=4):\n",
" vectors.append(doc.vector)\n",
" vector_norms.append(doc.vector_norm)\n",
"\n",
"kwd_nlp['vector'] = vectors\n",
"kwd_nlp['vector_norm'] = vector_norms\n",
"kwd_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 95,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) TNSE-X TNSE-Y\n0 COMPARATIVE GENOMICS -114.811630 -43.915569\n1 ANAMMOX 8.044455 100.761032\n2 KUENENIA STUTTGARTIENSIS 8.044455 100.761032\n3 METAGENOMICS 8.044455 100.761032\n4 ENRICHMENT CULTURE -99.356590 -78.270439",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>TNSE-X</th>\n <th>TNSE-Y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>COMPARATIVE GENOMICS</td>\n <td>-114.811630</td>\n <td>-43.915569</td>\n </tr>\n <tr>\n <th>1</th>\n <td>ANAMMOX</td>\n <td>8.044455</td>\n <td>100.761032</td>\n </tr>\n <tr>\n <th>2</th>\n <td>KUENENIA STUTTGARTIENSIS</td>\n <td>8.044455</td>\n <td>100.761032</td>\n </tr>\n <tr>\n <th>3</th>\n <td>METAGENOMICS</td>\n <td>8.044455</td>\n <td>100.761032</td>\n </tr>\n <tr>\n <th>4</th>\n <td>ENRICHMENT CULTURE</td>\n <td>-99.356590</td>\n <td>-78.270439</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"# % matplotlib inline\n",
"\n",
"vector_data = pd.DataFrame(kwd_nlp[\"vector\"].to_list(), index=kwd_nlp[\"keyword_all\"]).reset_index()\n",
"vector_data.head()\n",
"\n",
"labels = vector_data.values[:, 0]\n",
"record_vectors = vector_data.values[:, 1:]\n",
"\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 96,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
]
},
{
"data": {
"text/plain": "<matplotlib.legend.Legend at 0x1b1f8532370>"
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAGwCAYAAAD2XSKVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9fXRU9b3v/yIkM3mcDJOnScJkIAwhIRAQQiFIxEjUUrGt0B7UeuSWU6pGxXps69V1df1++lva2uNCUVM89NLitSKnF2iVnlRNCBghUBKBgUiYDCHJkMlk8sBk8jiTkPz+2LM3eYQAQQh8X2uxgMzOzN6z98z3vT8P78+Evr6+PgQCgUAgEAgENyR+13sHBAKBQCAQCAQjI8SaQCAQCAQCwQ2MEGsCgUAgEAgENzBCrAkEAoFAIBDcwAixJhAIBAKBQHADI8SaQCAQCAQCwQ2MEGsCgUAgEAgENzD+13sHxhu9vb3Y7XbCwsKYMGHC9d4dgUAgEAgEo6Cvr4/W1lbi4uLw87v2sarz58/T3d097GMBAQFMnDhx1M8lxNplYrfbMRgM13s3BAKBQCAQXAE2m43Jkydfs+fv6+vD4XDgcrkuup1Wq0Wv148q8CPE2mUSFhYGSCdbo9Fc570RCAQCgUAwGtxuNwaDQVnHrxWyUIuOjiY4OHiIGOvr66OjowOn0wlAbGzsJZ9TiLXLRH7TNRqNEGsCgUAgEIwzrmUJ0/nz5xWhFhERMeJ2QUFBADidTqKjoy+ZEhUNBgKBQCAQCARjgFyjFhwcfMlt5W1GqmvrjxBrAoFAIBAIBGPIaKJ3lxPhE2JNIBAIBAKB4AZGiDWBQCAQCASCGxgh1gQCgUAgEAhuYIRYEwgEAoFAIBhD+vr6xmQbGSHWBAKBQCAQCMaAgIAAADo6Oi65rbyN/DsXQ/isCQQCgUAgEIwBEydORKvVKoa3lzLF1Wq1oxo7JcSaQCAQCAQCwRih1+sBFME2EvK4qdEgxJpAIBAIBALBGDFhwgRiY2OJjo4Wg9wFAoFAIBAIblQmTpx4WYLsYogGA4FAIBAIBIIbGCHWBAKBQCAQCG5ghFgTCATjFrur83rvgkAgEFxzhFgTCATXhasVWnZXJ0t/V3hNBZsQgwKB4EZAiDWBQPCtMxZCK04bxL5fZRGnDRr1a14O34YYFAgEgtEgxJpAIPjWuVyhdbHnGQ1XIrzGah8FAoHgahFiTSAQXBe+TRF0pcJLCDWBQHAjIMSaQCC4JlxN+vBapB6F8BIIBOMVIdYEAsGouBwBdTX1XqJWTCAQCAYixJpAILgklyugrqbea7S/K8ScQCC4VRBiTSAQXJIrEV9Xk3YcjVAT0TeBQHCrIMSaQCAYFTdSzZfo1BQIBLcSQqwJBLcYN0s0Sgg1gUBwqyDEmkBwC3Ejpg9vpH0RCASCGxEh1gSCW4ixSh+OVmBdarsbUTwKBALBjYYQawLBLcZYCLXRCKzRbCdqzwQCgeDSTOjr6+u73jsxnnC73YSHh9PS0oJGo7neuyMQjDl2V+eoujFHI7BGu53g+iHOkeBWYTyv3yKyJhAIFEYbNRvt4i5EwI2NSEMLBOMDEVm7TMazMhcIRoOItNxaiPMtuFUYz+u3iKwJBIIBiIX71kKcb4HgxkeINYFgHDNWXZnfxj4IBAKB4MoQYk0gGKeMZVfmtd4HgUAgEFw5ombtMhnPOW/Bzce17socy85QgUAguJ6M5/VbRNYEgnHMtezKHOvOUIFAIBBcGeNGrP3+978nLS0NjUaDRqMhIyODvLw85fGuri6efPJJIiIiCA0NZdWqVdTX1w94jpqaGu677z6Cg4OJjo7mV7/6FT09Pd/2oQgE4wJhWCsQCAQ3BuNGrE2ePJnf/OY3lJaWUlJSwl133cUPfvADysrKAHj22Wf59NNP+ctf/sK+ffuw2+2sXLlS+f3z589z33334fV6OXDgAFu3buVPf/oTL7/88vU6JIHguiOiZgKBQHDjM65r1nQ6Hb/73e/40Y9+RFRUFB999BE/+tGPACgvLyclJYXi4mIWLVpEXl4eK1aswG63ExMTA8CmTZt4/vnnaWhoQKVSjeo1x3POWzA+uVY1YXKaU0TPBALBrcB4Xr/HTWStP+fPn+fjjz+mvb2djIwMSktL6e7uJjs7W9kmOTmZhIQEiouLASguLmb27NmKUAO49957cbvdSnRuODweD263e8AfgeDb4mq6LUcTNRNCTSAQCG58xpVYO378OKGhoajVah5//HF27drFzJkzcTgcqFQqtFrtgO1jYmJwOBwAOByOAUJNflx+bCRef/11wsPDlT8Gg2FsD0oguAhXKqhEc4BAIBDcPIwrsTZjxgyOHj3KoUOHeOKJJ1izZg3ffPPNNX3NF154gZaWFuWPzWa7pq8nEAzmSgSViJoJBALBzYP/9d6By0GlUmEymQCYP38+hw8f5u2332b16tV4vV5cLteA6Fp9fT16vR4AvV7PP//5zwHPJ3eLytsMh1qtRq1Wj/GRCATXHiHUBAKB4OZgXEXWBtPb24vH42H+/PkEBARQUFCgPHbq1ClqamrIyMgAICMjg+PHj+N0OpVtvvjiCzQaDTNnzvzW910gkBHu/wKBQCC4GOMmsvbCCy+wfPlyEhISaG1t5aOPPmLv3r189tlnhIeH82//9m/8+7//OzqdDo1Gw9NPP01GRgaLFi0C4J577mHmzJn867/+K2+88QYOh4P/9b/+F08++aSInAmuG1fTkSkmBwgEAsGtwbgRa06nk0cffZS6ujrCw8NJS0vjs88+4+677wZgw4YN+Pn5sWrVKjweD/feey+5ubnK70+cOJHdu3fzxBNPkJGRQUhICGvWrOGVV165XockuMm4EvF0tQ0Eoi5NIBAIbn7Gtc/a9WA8+7QIrh3XQzzd6JG1G33/BALBrcV4Xr/Hdc2aQHCjcD26L29kIXQ1/nACgUAgGIgQawLBGHEji6dvG2EdIhAIBGOHEGsCgeCaIISaQCAQjA1CrAkEAoFAIBDcwAixJhAIBKNgQ77leu+CQCC4RRFiTSC4TETR/K3B5r0VABRZnGzIt5BbaBWCTSAQXBfGjc+aQHAjIPzNbg02763gjS8qaO/pw9vdzeavqsnJMpEcHXq9d00gENyCiMiaQMDoo2Wiy/HmJc9sV/69cFoUjyw00NLhYfNX1WSnRAHwWt5Jth2suk57KBAIblWEWBPc8lyuJ5gQajcfeWY72w7XKILtQIUTjcqPDw/ZeGShgfyTDdS3dABwrNaF2ea6jnsrEAhuNYRYE9zyiGiZ4ExDG8WVzZx2tvFuvoV/fFNPbpEUUXO4u0iNDcPu6mJqZAg7vrZTVuu63rssEAhuIUTNmkCAiJbdiOQWWPD0wbPZSdf8tXKWJXG+D/5Z3cyZxnbuTY1BExRAu+c8xZXN5GQayS2q5vFMI4um6Hho0ZRrvk8CgUAgIyJrAoHghiO3wMKGPVYsjpZr3oH5Xr6FPLOdli4vxZXNRISoKKk6B4C7s5vslCglyvbXYw4OVjVTZHFe030SCASC/gixJhAIbjimRoWyal4c+ScbiAi8dgmADfkWPi61sX77UT4rq+eJTCNN7V7K6lo509jO7Mnh1Lm6eDzTyIlaNz9ZYKC4shmAjfkWYeMiEAi+FUQaVCAQ3HAsT4sDYGaMhkeXJF6T19h2sIrcQiuPLDSgC1bzjaOF3/tSnUXWZpaadOQWVbMmI4GG9m4cbg8AW9akU3y6gU/NDs51eFh3h0mk0QUCwTVFiDWB4DKxuzrF4vwtIAu2scZsc5Fm0Cp1Z6nxWiLD1PSWQHhQAJuKqtFr1BypbSUjUUetq5P8kw2szzLx+F0X6uc2f1XNXckx4loQCATXHJEGFQgug8u1+RDcWJhtLj76Z5VSB/fQoilEhqn53Wcn+b+lNnZ8bScjUcfUyBCKK5tJN+rIP9lARqIOZ1uXYtmRmRTNjscXk5kUfR2PRiAQ3CqIyJpAwOijZcLmY3xz6HQDO762kxobxgTgF9lJxGmDuC1eS+a0SDw9vXzjcLP3VAN6jZpQlR+/vns6X5918fH
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"g = sns.scatterplot(tnse_data, x=\"TNSE-X\", y=\"TNSE-Y\", s=1)\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 99,
"outputs": [],
"source": [
"wos_nlp.to_csv(f\"{outdir}/wos_nlp.csv\", index=False, sep='\\t')\n",
"tnse_data.to_csv(f\"{outdir}/kw_nlp.csv\", index=False, sep='\\t')\n",
"\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)\n",
"tnse_data.drop_duplicates(subset=record_col).to_excel(f\"{outdir}/kw_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}