You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_processing.ipynb

2773 lines
666 KiB
Plaintext

2 years ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 35,
2 years ago
"outputs": [],
"source": [
2 years ago
"import numpy as np\n",
2 years ago
"import pandas as pd\n",
"import os\n",
1 year ago
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re\n",
"import spacy"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I like salty fries and hamburgers. <-> Fast food tastes very good. 0.691649353055761\n",
"salty fries <-> hamburgers 0.6938489675521851\n"
]
}
],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load(\"en_core_web_md\") # make sure to use larger package!\n",
"doc1 = nlp(\"I like salty fries and hamburgers.\")\n",
"doc2 = nlp(\"Fast food tastes very good.\")\n",
"\n",
"# Similarity of two documents\n",
"print(doc1, \"<->\", doc2, doc1.similarity(doc2))\n",
"# Similarity of tokens and spans\n",
"french_fries = doc1[2:4]\n",
"burgers = doc1[5]\n",
"print(french_fries, \"<->\", burgers, french_fries.similarity(burgers))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I\n",
"salty fry\n",
"hamburger\n"
]
},
{
"data": {
"text/plain": "[None, None, None]"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[print(i.lemma_) for i in doc1.noun_chunks]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [],
"source": [
"doc_test = nlp(\"On the inevitability of neural networks and other tasty topics of the 21st century\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
2 years ago
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": "['the inevitability',\n 'neural network',\n 'other tasty topic',\n 'the 21st century']"
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[i.lemma_ for i in doc_test.noun_chunks]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"data": {
"text/plain": "(300,)"
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc1.vector.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": "\"tokens = []\\nlemma = []\\npos = []\\n\\nfor doc in nlp.pipe(df['species'].astype('unicode').values, batch_size=50,\\n n_threads=3):\\n if doc.is_parsed:\\n tokens.append([n.text for n in doc])\\n lemma.append([n.lemma_ for n in doc])\\n pos.append([n.pos_ for n in doc])\\n else:\\n # We want to make sure that the lists of parsed results have the\\n # same number of entries of the original Dataframe, so add some blanks in case the parse fails\\n tokens.append(None)\\n lemma.append(None)\\n pos.append(None)\\n\\ndf['species_tokens'] = tokens\\ndf['species_lemma'] = lemma\\ndf['species_pos'] = pos\""
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#spacy pipe example\n",
"\"\"\"tokens = []\n",
"lemma = []\n",
"pos = []\n",
"\n",
"for doc in nlp.pipe(df['species'].astype('unicode').values, batch_size=50,\n",
" n_threads=3):\n",
" if doc.is_parsed:\n",
" tokens.append([n.text for n in doc])\n",
" lemma.append([n.lemma_ for n in doc])\n",
" pos.append([n.pos_ for n in doc])\n",
" else:\n",
" # We want to make sure that the lists of parsed results have the\n",
" # same number of entries of the original Dataframe, so add some blanks in case the parse fails\n",
" tokens.append(None)\n",
" lemma.append(None)\n",
" pos.append(None)\n",
"\n",
"df['species_tokens'] = tokens\n",
"df['species_lemma'] = lemma\n",
"df['species_pos'] = pos\"\"\""
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 26,
2 years ago
"metadata": {},
"outputs": [],
2 years ago
"source": [
"workdir_path=r\"wos_extract\"\n",
"outfile='wos_extract_complete.csv'\n",
2 years ago
"# with_header=True\n",
"# for root, dirs, files in os.walk(workdir_path):\n",
"# for filename in files:\n",
"# if filename.startswith(\"wosexport\"):\n",
"# path=os.path.join(root, filename)\n",
"# print(path)\n",
"# chunk = pd.read_excel(path)\n",
"# chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
"# with_header = False"
]
2 years ago
},
{
"cell_type": "code",
"execution_count": 27,
2 years ago
"metadata": {},
2 years ago
"outputs": [],
"source": [
2 years ago
"record_col=\"UT (Unique WOS ID)\""
]
2 years ago
},
{
"cell_type": "code",
"execution_count": 28,
2 years ago
"metadata": {},
2 years ago
"outputs": [],
"source": [
2 years ago
"wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
"\n",
"\n",
"metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n",
"metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n",
"metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"\n",
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n",
"wos = wos.rename(columns={'level_72':\"issn_var\", 0:\"issn\"})\n",
"\n",
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
"wos = wos_merge.sort_values(by=\"issn_var\",ascending=False).drop_duplicates(subset=record_col)"
]
2 years ago
},
{
"cell_type": "code",
"execution_count": 29,
2 years ago
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "0 Publication Type\n1 Authors\n2 Book Authors\n3 Book Editors\n4 Book Group Authors\n ... \n76 SubField_English\n77 2.00 SEQ\n78 Source_title\n79 srcid\n80 issn_type\nLength: 81, dtype: object"
},
"execution_count": 29,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
2 years ago
],
2 years ago
"source": [
"pd.Series(wos.columns)"
]
2 years ago
},
{
"cell_type": "code",
"execution_count": 30,
2 years ago
"metadata": {},
2 years ago
"outputs": [
{
"data": {
2 years ago
"text/plain": "0 Salucci, Marco/S-8654-2016; Arrebola, Manuel/L...\n9714 Huang, Yu/AAY-5464-2020\n9697 Kakavand, Mohammad Reza Azadi/X-9556-2019; Fen...\n9699 Dong, Sheng/AAE-3619-2021; Soares, Carlos Gued...\n9701 Han, Guoqi/T-7365-2019; Nan, Yang/HKD-9687-202...\n ... \n3066 ; Liotta, Antonio/G-9532-2014\n5097 , 卢帅/AAK-2185-2020; Popp, József/AFN-1250-2022\n11369 NaN\n11368 Rossiter, D G/D-3842-2009\n11362 Jin, Shuanggen/B-8094-2008\nName: Researcher Ids, Length: 9889, dtype: object"
2 years ago
},
"execution_count": 30,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2 years ago
"wos[\"Researcher Ids\"]"
]
},
{
"cell_type": "code",
"execution_count": 31,
2 years ago
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " Publication Type Authors \n16979 J Zhang, MS; Huang, J; Cao, Y; Xiong, CH; Mohamm... \\\n1880 J Zhang, MS; Huang, J; Cao, Y; Xiong, CH; Mohamm... \n\n Book Authors Book Editors Book Group Authors \n16979 NaN NaN NaN \\\n1880 NaN NaN NaN \n\n Author Full Names \n16979 Zhang, Mengshi; Huang, Jian; Cao, Yu; Xiong, C... \\\n1880 Zhang, Mengshi; Huang, Jian; Cao, Yu; Xiong, C... \n\n Book Author Full Names Group Authors \n16979 NaN NaN \\\n1880 NaN NaN \n\n Article Title \n16979 Echo State Network-Enhanced Super-Twisting Con... \\\n1880 Echo State Network-Enhanced Super-Twisting Con... \n\n Source Title ... Web of Science Record \n16979 IEEE-ASME TRANSACTIONS ON MECHATRONICS ... 0 \\\n1880 IEEE-ASME TRANSACTIONS ON MECHATRONICS ... 0 \n\n issn_var issn Domain_English Field_English \n16979 issn 10834435 Applied Sciences Engineering \\\n1880 issn 10834435 Applied Sciences Engineering \n\n SubField_English 2.00 SEQ \n16979 Industrial Engineering & Automation 27 \\\n1880 Industrial Engineering & Automation 27 \n\n Source_title srcid issn_type \n16979 IEEE/ASME Transactions on Mechatronics 19113.0 issn1 \n1880 IEEE/ASME Transactions on Mechatronics 19113.0 issn1 \n\n[2 rows x 81 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Publication Type</th>\n <th>Authors</th>\n <th>Book Authors</th>\n <th>Book Editors</th>\n <th>Book Group Authors</th>\n <th>Author Full Names</th>\n <th>Book Author Full Names</th>\n <th>Group Authors</th>\n <th>Article Title</th>\n <th>Source Title</th>\n <th>...</th>\n <th>Web of Science Record</th>\n <th>issn_var</th>\n <th>issn</th>\n <th>Domain_English</th>\n <th>Field_English</th>\n <th>SubField_English</th>\n <th>2.00 SEQ</th>\n <th>Source_title</th>\n <th>srcid</th>\n <th>issn_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>16979</th>\n <td>J</td>\n <td>Zhang, MS; Huang, J; Cao, Y; Xiong, CH; Mohamm...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Zhang, Mengshi; Huang, Jian; Cao, Yu; Xiong, C...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Echo State Network-Enhanced Super-Twisting Con...</td>\n <td>IEEE-ASME TRANSACTIONS ON MECHATRONICS</td>\n <td>...</td>\n <td>0</td>\n <td>issn</td>\n <td>10834435</td>\n <td>Applied Sciences</td>\n <td>Engineering</td>\n <td>Industrial Engineering &amp; Automation</td>\n <td>27</td>\n <td>IEEE/ASME Transactions on Mechatronics</td>\n <td>19113.0</td>\n <td>issn1</td>\n </tr>\n <tr>\n <th>1880</th>\n <td>J</td>\n <td>Zhang, MS; Huang, J; Cao, Y; Xiong, CH; Mohamm...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Zhang, Mengshi; Huang, Jian; Cao, Yu; Xiong, C...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Echo State Network-Enhanced Super-Twisting Con...</td>\n <td>IEEE-ASME TRANSACTIONS ON MECHATRONICS</td>\n <td>...</td>\n <td>0</td>\n <td>issn</td>\n <td>10834435</td>\n <td>Applied Sciences</td>\n <td>Engineering</td>\n <td>Industrial Engineering &amp; Automation</td>\n <td>27</td>\n <td>IEEE/ASME Transactions on Mechatronics</td>\n <td>19113.0</td>\n <td>issn1</td>\n </tr>\n </tbody>\n</table>\n<p>2 rows × 81 columns</p>\n</div>"
},
"execution_count": 31,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
2 years ago
],
2 years ago
"source": [
"wos[(~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False))]"
]
2 years ago
},
1 year ago
{
"cell_type": "code",
"execution_count": 32,
1 year ago
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Keywords Plus \n0 WOS:000852293800024 CONVOLUTIONAL NEURAL-NETWORK; DEEP LEARNING FR... \\\n9714 WOS:000540750000002 STATE-SPACE RECONSTRUCTION; SURFACE AIR-TEMPER... \n9697 WOS:000600708400002 COMPRESSIVE STRENGTH; MODELS; ADABOOST.RT; DUC... \n9699 WOS:000511965100005 STRUCTURAL RELIABILITY; FAILURE MODES \n9701 WOS:000663142500003 REFLECTED GPS SIGNALS; SOIL-MOISTURE; OCEAN; S... \n... ... ... \n3066 WOS:000528727500074 LOCAL SEARCH; ALGORITHM; VARIANCE; MODEL \n5097 WOS:000596139400001 INDUSTRY 4.0; MANAGEMENT; RISK; ANALYTICS; CHA... \n11369 WOS:000436774300069 NaN \n11368 WOS:000846290700001 PARTIAL LEAST-SQUARES; INFRARED-SPECTROSCOPY; ... \n11362 WOS:000480527800025 MICROWAVE DIELECTRIC BEHAVIOR; GPS SIGNALS; RE... \n\n Author Keywords \n0 Imaging; Three-dimensional displays; Electroma... \\\n9714 NaN \n9697 Plastic hinge length; RC columns; Machine lear... \n9699 system reliability; jacket platform; beta-unzi... \n9701 Cyclone GNSS (CYGNSS); Sea surface wind speed;... \n... ... \n3066 sea surface temperature; sea surface temperatu... \n5097 Big data finance; Big data in financial servic... \n11369 planetary gear; fault diagnosis; VMD; center f... \n11368 soil fertility class; reflectance spectroscopy... \n11362 global navigation satellite system (GNSS)-refl... \n\n Article Title \n0 Artificial Intelligence: New Frontiers in Real... \\\n9714 Detecting causality from time series in a mach... \n9697 Data-Driven Approach to Predict the Plastic Hi... \n9699 System Reliability Analysis of an Offshore Jac... \n9701 Analysis of coastal wind speed retrieval from ... \n... ... \n3066 Improved Particle Swarm Optimization for Sea S... \n5097 Current landscape and influence of big data on... \n11369 Planetary Gear Fault Diagnosis via Feature Ima... \n11368 How Well Can Reflectance Spectroscopy Allocate... \n11362 GNSS-R Soil Moisture Retrieval Based on a XGbo... \n\n Abstract \n0 In recent years, artificial intelligence (AI) ... \n9714 Detecting causality from observational data is... \n9697 Inelastic response of reinforced concrete colu... \n9699 This study investigates strategies for solving... \n9701 This paper demonstrates the capability and per... \n... ... \n3066 The Sea Surface Temperature (SST) is one of th... \n5097 Big data is one of the most recent business an... \n11369 Poor working environment leads to frequent fai... \n11368 Fertilization decisions depend on the measurem... \n11362 Global navigation satellite system (GNSS)-refl... \n\n[9889 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Keywords Plus</th>\n <th>Author Keywords</th>\n <th>Article Title</th>\n <th>Abstract</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000852293800024</td>\n <td>CONVOLUTIONAL NEURAL-NETWORK; DEEP LEARNING FR...</td>\n <td>Imaging; Three-dimensional displays; Electroma...</td>\n <td>Artificial Intelligence: New Frontiers in Real...</td>\n <td>In recent years, artificial intelligence (AI) ...</td>\n </tr>\n <tr>\n <th>9714</th>\n <td>WOS:000540750000002</td>\n <td>STATE-SPACE RECONSTRUCTION; SURFACE AIR-TEMPER...</td>\n <td>NaN</td>\n <td>Detecting causality from time series in a mach...</td>\n <td>Detecting causality from observational data is...</td>\n </tr>\n <tr>\n <th>9697</th>\n <td>WOS:000600708400002</td>\n <td>COMPRESSIVE STRENGTH; MODELS; ADABOOST.RT; DUC...</td>\n <td>Plastic hinge length; RC columns; Machine lear...</td>\n <td>Data-Driven Approach to Predict the Plastic Hi...</td>\n <td>Inelastic response of reinforced concrete colu...</td>\n </tr>\n <tr>\n <th>9699</th>\n <td>WOS:000511965100005</td>\n <td>STRUCTURAL RELIABILITY; FAILURE MODES</td>\n <td>system reliability; jacket platform; beta-unzi...</td>\n <td>System Reliability Analysis of an Offshore Jac...</td>\n <td>This study investigates strategies for solving...</td>\n </tr>\n <tr>\n <th>9701</th>\n <td>WOS:000663142500003</td>\n <td>REFLECTED GPS SIGNALS; SOIL-MOISTURE; OCEAN; S...</td>\n <td>Cyclone GNSS (CYGNSS); Sea surface wind speed;...</td>\n <td>Analysis of coastal wind speed retrieval from ...</td>\n <td>This paper demonstrates the capability and per...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>3066</th>\n <td>WOS:000528727500074</td>\n <td>LOCAL SEARCH; ALGORITHM; VARIANCE; MODEL</td>\n <td>sea surface temperature; sea surface temperatu...</td>\n <td>Improved Particle Swarm Optimization for Sea S...</td>\n <td>The Sea Surface Temperature (SST) is one of th...</td>\n </tr>\n <tr>\n <th>5097</th>\n <td>WOS:000596139400001</td>\n <td>INDUSTRY 4.0; MANAGEMENT; RISK; ANALYTICS; CHA...</td>\n <td>Big data finance; Big data in financial servic...</td>\n <td>Current landscape and influence of big data on...</td>\n <td>Big data is one of the most recent business an...</td>\n </tr>\n <tr>\n <th>11369</th>\n <td>WOS:000436774300069</td>\n <td>NaN</td>\n <td>planetary gear; fault diagnosis; VMD; center f...</td>\n <td>Planetary Gear Fault Diagnosis via Feature Ima...</td>\n <td>Poor working environment leads to frequent fai...</td>\n </tr>\n <tr>\n <th>11368</th>\n <td>WOS:000846290700001</td>\n <td>PARTIAL LEAST-SQUARES; INFRARED-SPECTROSCOPY; ...</td>\n <td>soil fertility class; reflectance spectroscopy...</td>\n <td>How Well Can Reflectance Spectroscopy Allocate...</td>\n <td>Fertilization decisions depend on the measurem...</td>\n </tr>\n <tr>\n <th>11362</th>\n <td>WOS:000480527800025</td>\n <td>MICROWAVE DIELECTRIC BEHAVIOR; GPS SIGNALS; RE...</td>\n <td>global navigation satellite system (GNSS)-refl...</td>\n <td>GNSS-R Soil Moisture Retrieval Based on a XGbo...</td>\n <td>Global navigation satellite system (GNSS)-refl...</td>\n </tr>\n </tbody>\n</table>\n<p>9889 rows × 5 columns</p>\n</div>"
},
"execution_count": 32,
1 year ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[[record_col,\"Keywords Plus\",\"Author Keywords\",\"Article Title\",\"Abstract\"]]\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 68,
1 year ago
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n1 WOS:000297893800037 ADAPTIVE DYNAMIC SURFACE CONTROL\n2 WOS:000297893800037 NEURAL COMPENSATOR\n3 WOS:000297893800037 BUCK CONVERTER\n4 WOS:000297893800037 FINITE-TIME IDENTIFIER\n5 WOS:000301090100061 TEMPORAL CONJUNCTION\n.. ... ...\n99 WOS:000309409400280 SCIENTIFIC DATA CLOUD\n100 WOS:000309409400280 VIRTUAL DATASPACES\n101 WOS:000309409400280 SEMANTIC INTEGRATION\n102 WOS:000309409400280 ONTOLOGY\n103 WOS:000309409400280 PAY-AS-YOU-GO\n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000297893800037</td>\n <td>ADAPTIVE DYNAMIC SURFACE CONTROL</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000297893800037</td>\n <td>NEURAL COMPENSATOR</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000297893800037</td>\n <td>BUCK CONVERTER</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000297893800037</td>\n <td>FINITE-TIME IDENTIFIER</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000301090100061</td>\n <td>TEMPORAL CONJUNCTION</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000309409400280</td>\n <td>SCIENTIFIC DATA CLOUD</td>\n </tr>\n <tr>\n <th>100</th>\n <td>WOS:000309409400280</td>\n <td>VIRTUAL DATASPACES</td>\n </tr>\n <tr>\n <th>101</th>\n <td>WOS:000309409400280</td>\n <td>SEMANTIC INTEGRATION</td>\n </tr>\n <tr>\n <th>102</th>\n <td>WOS:000309409400280</td>\n <td>ONTOLOGY</td>\n </tr>\n <tr>\n <th>103</th>\n <td>WOS:000309409400280</td>\n <td>PAY-AS-YOU-GO</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
1 year ago
},
"execution_count": 68,
1 year ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_df = pd.DataFrame()\n",
"for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
" kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
" kwp.name = 'keyword_all'\n",
1 year ago
" kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n",
"kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n",
"kw_df.head(100)"
1 year ago
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 69,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000297893800037 ADAPTIVE DYNAMIC SURFACE CONTROL; NEURAL COMPE...\n1 WOS:000301090100061 TEMPORAL CONJUNCTION; CAUDATE NUCLEUS; PREFRON...\n2 WOS:000301155300013 AUTOMATIC INCIDENT DETECTION; DATA CLEANSING; ...\n3 WOS:000301973200015 TRACHEO-BRONCHIAL; LUNG; INNERVATION; ESOPHAGE...\n4 WOS:000302289400006 LINGUISTIC ANNOTATION; ANNOTATION TOOLS; INTER...",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000297893800037</td>\n <td>ADAPTIVE DYNAMIC SURFACE CONTROL; NEURAL COMPE...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000301090100061</td>\n <td>TEMPORAL CONJUNCTION; CAUDATE NUCLEUS; PREFRON...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000301155300013</td>\n <td>AUTOMATIC INCIDENT DETECTION; DATA CLEANSING; ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000301973200015</td>\n <td>TRACHEO-BRONCHIAL; LUNG; INNERVATION; ESOPHAGE...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000302289400006</td>\n <td>LINGUISTIC ANNOTATION; ANNOTATION TOOLS; INTER...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
"wos_kwd_concat.head()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 34,
1 year ago
"outputs": [
{
"data": {
"text/plain": "Downloading pytorch_model.bin: 0%| | 0.00/438M [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "0d9a3ff741694ac895a40780392c62fe"
1 year ago
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "Downloading (…)nce_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "ed4c1401e1aa4bfc88bf3a97e178b5e2"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "Downloading (…)cial_tokens_map.json: 0%| | 0.00/239 [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "60046d76b6694b1dbf6f7f22ade78d7d"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "Downloading (…)a8e1d/tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "5529ba6b228440cd8d8388bf087e20c0"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "Downloading (…)okenizer_config.json: 0%| | 0.00/363 [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "88cfcf91709d479abe7d302419b5e0a6"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "Downloading (…)8e1d/train_script.py: 0%| | 0.00/13.1k [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "fcd1449c2b5f4447bb26f4c8a323e372"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "Downloading (…)b20bca8e1d/vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "6762f031a4694013aa75cda0f75c648c"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": "Downloading (…)bca8e1d/modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "6feda2df252e428b83db52ea36d58ca1"
}
},
"metadata": {},
"output_type": "display_data"
1 year ago
}
],
"source": [
"from keybert import KeyBERT\n",
"\n",
"# Uses stopwords for english from NLTK, and all puntuation characters by\n",
"# default\n",
"kw_model = KeyBERT(model='all-mpnet-base-v2')\n",
"\n",
"# Extraction given the text.\n",
"# r.extract_keywords_from_text(<text to process>)\n",
"\n",
"# keywords = kw_model.extract_keywords(full_text,\n",
"#\n",
"# keyphrase_ngram_range=(1, 3),\n",
"#\n",
"# stop_words='english',\n",
"#\n",
"# highlight=False,\n",
"#\n",
"# top_n=10)\n",
"#\n",
"# keywords_list= list(dict(keywords).keys())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 78,
1 year ago
"outputs": [
{
"data": {
"text/plain": "'ELECTROMAGNETIC IMAGING; INVERSE SCATTERING; SCATTERING ELECTROMAGNETIC'"
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
1 year ago
}
],
"source": [
"def kwd_extract(text):\n",
" keywords = kw_model.extract_keywords(text,\n",
"\n",
" keyphrase_ngram_range=(1, 2),\n",
"\n",
" stop_words='english',\n",
"\n",
" highlight=False,\n",
"\n",
" top_n=3)\n",
" return \"; \".join([i[0].upper() for i in keywords])\n",
"\n",
"kwd_extract(text=\"Artificial Intelligence: New Frontiers in Real-Time Inverse Scattering and Electromagnetic Imaging - In recent years, artificial intelligence (AI) techniques have been developed rapidly. With the ...\")"
1 year ago
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 61,
"outputs": [
{
"data": {
"text/plain": "'ELECTROMAGNETIC IMAGING; INVERSE SCATTERING; SCATTERING ELECTROMAGNETIC; SCATTERING; AI; ELECTROMAGNETIC; IMAGING; ARTIFICIAL INTELLIGENCE'"
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
1 year ago
"execution_count": 125,
1 year ago
"outputs": [],
"source": [
1 year ago
"# wos_nlp = wos[[record_col,\"Article Title\",\"Abstract\"]]\n",
"wos = wos.merge(wos_kwd_concat, on = record_col)\n",
"wos[\"Document\"] = wos[\"Article Title\"].str.cat(wos[[\"Abstract\",\"keyword_all\"]].fillna(\"\"), sep=' - ')\n",
"# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n",
"\n",
"vectors = list()\n",
"vector_norms = list()\n",
1 year ago
"\n",
1 year ago
"for doc in nlp.pipe(wos['Document'].astype('unicode').values, batch_size=100,\n",
" n_process=4):\n",
" vectors.append(doc.vector)\n",
" vector_norms.append(doc.vector_norm)\n",
"\n",
1 year ago
"wos['vector'] = vectors\n",
"wos['vector_norm'] = vector_norms"
1 year ago
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
1 year ago
"execution_count": 87,
1 year ago
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
1 year ago
},
1 year ago
"execution_count": 87,
1 year ago
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
1 year ago
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAGdCAYAAADzOWwgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtdUlEQVR4nO3df1RU9b7/8ReCQ6LOGCog159lqZRWWumczJPBAZW6lbZOZoUl5dEDntTy1z1e+3XWwSw1O5XecyuxW15/rJudkuUPxNR+oCZFqBVq6cEuDHhTGSUFhP39o8V8G387DszA5/lYa6/V7P1mz/sdnc3r7Nl7T4hlWZYAAAAM1izQDQAAAAQagQgAABiPQAQAAIxHIAIAAMYjEAEAAOMRiAAAgPEIRAAAwHgEIgAAYLywQDfQGNTW1qq4uFitW7dWSEhIoNsBAAAXwbIsHTt2TLGxsWrW7PzngAhEF6G4uFidOnUKdBsAAMAHBw8eVMeOHc9bQyC6CK1bt5b0y79Qu90e4G4AAMDFcLvd6tSpk+fv+PkQiC5C3cdkdrudQAQAQCNzMZe7cFE1AAAwHoEIAAAYj0AEAACMRyACAADGIxABAADjEYgAAIDxCEQAAMB4BCIAAGA8AhEAADAegQgAABiPQAQAAIxHIAIAAMYjEAEAAOMRiAAAgPHCAt0AgKal6/SsQLdwyQ7MTg50CwACjDNEAADAeAQiAABgPAIRAAAwHoEIAAAYj0AEAACMRyACAADGIxABAADjBTQQLVy4UH369JHdbpfdbpfT6dSaNWs820+ePKm0tDS1bdtWrVq10ogRI1RaWuq1j6KiIiUnJysiIkJRUVGaMmWKTp065VWzadMm9e3bV+Hh4erevbsyMzMbYjwAANBIBDQQdezYUbNnz1ZeXp527NihO++8U/fcc492794tSZo0aZI++ugjrVy5Ups3b1ZxcbGGDx/u+fmamholJyerqqpKn3/+uZYsWaLMzEzNmjXLU7N//34lJydr8ODBys/P18SJE/X4449r3bp1DT4vAAAITiGWZVmBbuLXIiMj9dJLL+n+++9X+/bttXTpUt1///2SpO+++069evVSbm6uBgwYoDVr1uiuu+5ScXGxoqOjJUmLFi3StGnTdOjQIdlsNk2bNk1ZWVnatWuX5z1Gjhypo0ePau3atRfVk9vtlsPhUHl5uex2u/+HBpoQnlQNIFhcyt/voLmGqKamRsuWLVNFRYWcTqfy8vJUXV2thIQET03Pnj3VuXNn5ebmSpJyc3PVu3dvTxiSpKSkJLndbs9ZptzcXK991NXU7eNsKisr5Xa7vRYAANB0BTwQ7dy5U61atVJ4eLjGjRunVatWKS4uTi6XSzabTW3atPGqj46OlsvlkiS5XC6vMFS3vW7b+WrcbrdOnDhx1p4yMjLkcDg8S6dOnfwxKgAACFIBD0Q9evRQfn6+tm3bpvHjx2v06NH65ptvAtrTjBkzVF5e7lkOHjwY0H4AAED9Cvi33dtsNnXv3l2S1K9fP33xxRdasGCBHnjgAVVVVeno0aNeZ4lKS0sVExMjSYqJidH27du99ld3F9qva06/M620tFR2u10tWrQ4a0/h4eEKDw/3y3wAACD4BfwM0elqa2tVWVmpfv36qXnz5srJyfFsKywsVFFRkZxOpyTJ6XRq586dKisr89RkZ2fLbrcrLi7OU/PrfdTV1O0DAAAgoGeIZsyYoaFDh6pz5846duyYli5dqk2bNmndunVyOBxKTU3V5MmTFRkZKbvdrgkTJsjpdGrAgAGSpMTERMXFxemRRx7RnDlz5HK5NHPmTKWlpXnO8IwbN06vvfaapk6dqjFjxmjjxo1asWKFsrIa350wAACgfgQ0EJWVlSklJUUlJSVyOBzq06eP1q1bp9/97neSpPnz56tZs2YaMWKEKisrlZSUpDfeeMPz86GhoVq9erXGjx8vp9Opli1bavTo0Xr++ec9Nd26dVNWVpYmTZqkBQsWqGPHjnrzzTeVlJTU4PMCAIDgFHTPIQpGPIcIuHg8hwhAsGiUzyECAAAIFAIRAAAwHoEIAAAYj0AEAACMRyACAADGIxABAADjEYgAAIDxCEQAAMB4BCIAAGA8AhEAADAegQgAABiPQAQAAIxHIAIAAMYjEAEAAOMRiAAAgPEIRAAAwHgEIgAAYDwCEQAAMB6BCAAAGI9ABAAAjEcgAgAAxiMQAQAA4xGIAACA8QhEAADAeAQiAABgPAIRAAAwHoEIAAAYj0AEAACMRyACAADGIxABAADjEYgAAIDxCEQAAMB4BCIAAGA8AhEAADAegQgAABiPQAQAAIxHIAIAAMYjEAEAAOMRiAAAgPEIRAAAwHgEIgAAYDwCEQAAMB6BCAAAGI9ABAAAjEcgAgAAxiMQAQAA4wU0EGVkZOiWW25R69atFRUVpXvvvVeFhYVeNXfccYdCQkK8lnHjxnnVFBUVKTk5WREREYqKitKUKVN06tQpr5pNmzapb9++Cg8PV/fu3ZWZmVnf4wEAgEYioIFo8+bNSktL09atW5Wdna3q6molJiaqoqLCq+6JJ55QSUmJZ5kzZ45nW01NjZKTk1VVVaXPP/9cS5YsUWZmpmbNmuWp2b9/v5KTkzV48GDl5+dr4sSJevzxx7Vu3boGmxUAAASvsEC++dq1a71eZ2ZmKioqSnl5eRo0aJBnfUREhGJiYs66j/Xr1+ubb77Rhg0bFB0drRtvvFEvvPCCpk2bpmeffVY2m02LFi1St27dNHfuXElSr1699Omnn2r+/PlKSkqqvwEBAECjENBAdLry8nJJUmRkpNf69957T++++65iYmJ0991369///d8VEREhScrNzVXv3r0VHR3tqU9KStL48eO1e/du3XTTTcrNzVVCQoLXPpOSkjRx4sSz9lFZWanKykrPa7fb7Y/xAASprtOzAt3CJTswOznQLQBNStAEotraWk2cOFG33Xabrr/+es/6UaNGqUuXLoqNjVVBQYGmTZumwsJCvf/++5Ikl8vlFYYkeV67XK7z1rjdbp04cUItWrTw2paRkaHnnnvO7zMCAIDgFDSBKC0tTbt27dKnn37qtX7s2LGef+7du7c6dOig+Ph4ff/997r66qvrpZcZM2Zo8uTJntdut1udOnWql/cCAACBFxS33aenp2v16tX6+OOP1bFjx/PW9u/fX5K0b98+SVJMTIxKS0u9aupe1113dK4au91+xtkhSQoPD5fdbvdaAABA0xXQQGRZltLT07Vq1Spt3LhR3bp1u+DP5OfnS5I6dOggSXI6ndq5c6fKyso8NdnZ2bLb7YqLi/PU5OTkeO0nOztbTqfTT5MAAIDGLKCBKC0tTe+++66WLl2q1q1by+VyyeVy6cSJE5Kk77//Xi+88ILy8vJ04MABffjhh0pJSdGgQYPUp08fSVJiYqLi4uL0yCOP6Ouvv9a6des0c+ZMpaWlKTw8XJI0btw4/fDDD5o6daq+++47vfHGG1qxYoUmTZoUsNkBAEDwCGggWrhwocrLy3XHHXeoQ4cOnmX58uWSJJvNpg0bNigxMVE9e/bUU089pREjRuijjz7y7CM0NFSrV69WaGionE6nHn74YaWkpOj555/31HTr1k1ZWVnKzs7WDTfcoLlz5+rNN9/klnsAACBJCrEsywp0E8HO7XbL4XCovLyc64mAC2iMt7A3Rtx2D1zYpfz9DoqLqgEAAAKJQAQAAIxHIAIAAMYjEAEAAOMRiAAAgPEIRAAAwHgEIgAAYDwCEQAAMB6BCAAAGI9ABAAAjEcgAgAAxiMQAQAA4xGIAACA8QhEAADAeAQiAABgPAIRAAAwHoEIAAAYj0AEAACMRyACAADGIxABAADjEYgAAIDxCEQAAMB4BCIAAGA8AhEAADAegQgAABiPQAQAAIxHIAIAAMYjEAEAAOMRiAAAgPEIRAAAwHgEIgAAYDwCEQAAMB6BCAAAGI9ABAAAjEcgAgAAxiMQAQAA4xGIAACA8QhEAADAeAQiAABgPAIRAAAwHoEIAAAYj0AEAACMRyACAADGIxABAADjEYgAAIDxAhqIMjIydMs
},
"metadata": {},
"output_type": "display_data"
1 year ago
}
],
"source": [
1 year ago
"wos['vector_norm'].plot(kind=\"hist\")"
1 year ago
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
1 year ago
"execution_count": 132,
"outputs": [],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"vector_data = pd.DataFrame(wos[\"vector\"].to_list(), index = wos[record_col]).reset_index()\n",
"vector_data.head()\n",
"\n",
"labels = vector_data.values[:,0]\n",
"record_vectors = vector_data.values[:,1:]\n",
"\n",
"tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=42)\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 133,
"outputs": [],
"source": [
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns=[record_col,\"TNSE-X\",\"TNSE-Y\"]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 124,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) TNSE-X TNSE-Y\n0 WOS:000852293800024 42.244614 8.952363\n1 WOS:000540750000002 17.704300 -22.741098\n2 WOS:000600708400002 -23.244829 17.004990\n3 WOS:000511965100005 -17.139648 14.667156\n4 WOS:000663142500003 68.567207 3.378003",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>TNSE-X</th>\n <th>TNSE-Y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000852293800024</td>\n <td>42.244614</td>\n <td>8.952363</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000540750000002</td>\n <td>17.704300</td>\n <td>-22.741098</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000600708400002</td>\n <td>-23.244829</td>\n <td>17.004990</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000511965100005</td>\n <td>-17.139648</td>\n <td>14.667156</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000663142500003</td>\n <td>68.567207</td>\n <td>3.378003</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tnse_data.head()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 134,
"outputs": [
{
"data": {
"text/plain": "<matplotlib.legend.Legend at 0x2a436d42a00>"
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0oAAAGwCAYAAACXTJW7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3RURRvA4d9ueu+dhISaAAkQOkjvTZAiICgoFhREVBSxUAUsHxZEaSJFEaRL70V6DwQCaSSkkN57srv3+2Nhk81uGk3Rec7J0Tt37tzZS5LNuzPzjkySJAlBEARBEARBEARBQ/53d0AQBEEQBEEQBOGfRgRKgiAIgiAIgiAI5YhASRAEQRAEQRAEoRwRKAmCIAiCIAiCIJQjAiVBEARBEARBEIRyRKAkCIIgCIIgCIJQjgiUBEEQBEEQBEEQyjH8uzvwT6NSqbh79y5WVlbIZLK/uzuCIAiCIFSDJEnk5OTg7u6OXP54PwdWKpWUlJQ81nsIgvB4GBsbV/t3hAiUyrl79y6enp5/dzcEQRAEQXgAsbGx1KpV67G0LUkSiYmJZGZmPpb2BUF4/ORyOT4+PhgbG1dZVwRK5VhZWQHqX7TW1tZ/c28EQRAEQaiO7OxsPD09Ne/jj8P9IMnZ2Rlzc3Mx80QQnjL3Z44lJCTg5eVV5c+wCJTKuf/ArK2tRaAkCIIgCE+ZxxW8KJVKTZDk4ODwWO4hCMLj5+TkxN27d1EoFBgZGVVaVyRzEARBEARBqML9NUnm5uZ/c08EQXgY96fcKZXKKuuKQEkQBEEQBKGaxHQ7QXi61eRnWARKgiAIgiAIgiAI5YhASRAEQRAEQRAEoRwRKAmCIAiCIAiPxKxZs2jWrJnmeNy4cQwePPih2jx27BgymeyxpmV/FP0U/n1EoCQIgiAIgvAEKVUSZyLT+DMonjORaShV0hO575kzZzAwMKB///5P5H4A33//PatXr37s97l69SrPPvsszs7OmJqa4u3tzYgRI0hOTq7W9U+qn8LTRaQHFwRBEARBeEL2XU9g9s4QErIKNWVuNqbMHNiIPk3cHuu9V65cydtvv83KlSu5e/cu7u7uj/V+ADY2No/9HikpKXTv3p0BAwawf/9+bG1tiY6OZseOHeTl5VWrjSfRT+HpI0aUBEEQBEEQnoB91xN487fLWkESQGJWIW/+dpl91xMe271zc3P5448/ePPNN+nfv7/O6Mn96W27d+8mICAAU1NT2rZty/Xr1zV1Vq9eja2tLdu3b6d+/fqYmprSu3dvYmNjK7xv+SltKpWKBQsW4OPjg5mZGU2bNmXz5s1a1+zZs4cGDRpgZmZG165diY6OrvS1nTp1iqysLH7++WeaN2+Oj48PXbt25dtvv8XHx0dT78aNGwwYMABra2usrKzo2LEjkZGRD9TP+8/r8OHDtGzZEnNzc9q3b09oaKhW33bu3EmrVq0wNTXF0dGR5557TnOuqKiIqVOn4uHhgYWFBW3atOHYsWOa83fu3GHgwIHY2dlhYWFB48aN2bNnT6XPQni0RKAkCIIgCILwmClVErN3hqBvkt39stk7Qx7bNLyNGzfi6+tLw4YNGTNmDL/88guSpHuvDz74gIULF3LhwgWcnJwYOHCgZg8pgPz8fObNm8fatWs5deoUmZmZjBw5str9WLBgAWvXrmXp0qXcuHGDd999lzFjxnD8+HEAYmNjGTJkCAMHDiQoKIhXX32Vjz76qNI2XV1dUSgUbNu2Te9rAoiPj6dTp06YmJhw5MgRLl26xCuvvIJCoXigft73ySefsHDhQi5evIihoSGvvPKK5tzu3bt57rnn6NevH1euXOHw4cO0bt1ac37SpEmcOXOGDRs2cO3aNYYPH06fPn0IDw8HYOLEiRQVFfHXX38RHBzMl19+iaWlZdUPWXhkxNQ7QRAEQRCEx+x8VLrOSFJZEpCQVcj5qHTa1XV45PdfuXIlY8aMAaBPnz5kZWVx/PhxunTpolVv5syZ9OzZE4A1a9ZQq1Yttm3bxvPPPw+oN95dvHgxbdq00dTx8/Pj/PnzWkGAPkVFRcyfP59Dhw7Rrl07AOrUqcPJkydZtmwZnTt3ZsmSJdStW5eFCxcC0LBhQ02QUJG2bdvy8ccf88ILLzBhwgRat25Nt27deOmll3BxcQHgxx9/xMbGhg0bNmBkZARAgwYNHrif982bN09z/NFHH9G/f38KCwsxNTVl3rx5jBw5ktmzZ2vqN23aFICYmBhWrVpFTEyMZgrk1KlT2bdvH6tWrWL+/PnExMQwdOhQ/P39NX0QniwxoiQIgiAIgvCYJedUHCQ9SL2aCA0N5fz584waNQoAQ0NDRowYwcqVK3Xq3g8MAOzt7WnYsCE3b97UlBkaGtKqVSvNsa+vL7a2tlp1KhIREUF+fj49e/bE0tJS87V27VrNFLibN29qgjB9farIvHnzSExMZOnSpTRu3JilS5fi6+tLcHAwAEFBQXTs2FETJD1sP+8LCAjQ/L+bm3qN2f0EEkFBQXTv3l3vPYKDg1EqlTRo0EDrHsePH9fcY/LkyXz++ed06NCBmTNncu3atSr7LjxaYkRJEARBEAThMXO2Mn2k9Wpi5cqVKBQKreQNkiRhYmLC4sWLn1gig9zcXEA9Jc3Dw0PrnImJyUO37+DgwPDhwxk+fDjz58+nefPm/O9//2PNmjWYmZk9ln6WDbxkMhmgXt8EVHrP3NxcDAwMuHTpEgYGBlrn7k+ve/XVV+nduze7d+/mwIEDLFiwgIULF/L2229X+7UID0eMKAmCIAiCIDxmrX3scbMxRVbBeRnq7Hetfewf6X0VCgVr165l4cKFBAUFab6uXr2Ku7s769ev16p/9uxZzf9nZGQQFhaGn5+fVnsXL17UHIeGhpKZmalVpyKNGjXCxMSEmJgY6tWrp/Xl6ekJoJnGV1GfqsvY2Ji6detqst4FBARw4sQJrfVWD9PP6ggICODw4cN6zzVv3hylUklycrLOPVxdXTX1PD09mTBhAlu3buX9999nxYoV1b6/8PBEoCQIgl75xQq2XIrTu7A4t0jBh5uvkpT96KeICIIg/BsZyGXMHNgIQCdYun88c2AjDOQVhVIPZteuXWRkZDB+/HiaNGmi9TV06FCd6Xdz5szh8OHDXL9+nXHjxuHo6KiVDc7IyIi3336bc+fOcenSJcaNG0fbtm2rXJ8EYGVlxdSpU3n33XdZs2YNkZGRXL58mR9++IE1a9YAMGHCBMLDw/nggw8IDQ3l999/r3J/o127djFmzBh27dpFWFgYoaGh/O9//2PPnj0MGjQIUCdOyM7OZuTIkVy8eJHw8HB+/fVXnSx11e1ndcycOZP169czc+ZMbt68qbXWqkGDBowePZqXXnqJrVu3EhUVxfnz51mwYAG7d+8GYMqUKezfv5+oqCguX77M0aNHqxWQCo+OCJQE4V9IeW/awIOQJImpG69y6EYiS45FkpiVr1PHUC7Dy94cU0MDPS0IgiAI+vRp4saSMYG42mhPr3O1MWXJmMDHso/SypUr6dGjh97pdUOHDuXixYtaa1+++OIL3nnnHVq0aEFiYiI7d+7E2NhYc97c3Jxp06bxwgsv0KFDBywtLfnjjz+q3Z+5c+fy2WefsWDBAvz8/OjTpw+7d+/WpPH28vJiy5YtbN++naZNm7J06VLmz59faZuNGjXC3Nyc999/n2bNmtG2bVs2btzIzz//zIsvvgiop+UdOXKE3NxcOnfuTIsWLVixYkWFa5aq6md1dOnShU2bNrFjxw6aNWtGt27dtEbLVq1axUsvvcT7779Pw4YNGTx4MBcuXMDLywsApVLJxIkTNfdv0KABP/30U7XvLzw8mVRRHsX/qOzsbGxsbMjKysLa2vrv7o4gPJDbQ4di1b0HTm+9+UDX/37uDrXtzZixI4RNb7TD3lJ7Tvan24IZ3rIWTT3tHkV3BUEQHtrjfv8uLCwkKioKHx8fTE0fbh2RUiVxPiqd5JxCnK3U0+0e9UhSTR07doyuXbuSkZGBra2t3jqrV69mypQpZGZmPtG+CcKjVJOfZZHMQRD+hexffx3zMpl
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"wos_plot = wos.merge(tnse_data, on=record_col)\n",
"\n",
"g = sns.scatterplot(wos_plot[wos_plot[\"Domain_English\"]!='article-level classification'], x=\"TNSE-X\", y=\"TNSE-Y\", hue='Domain_English', s=1)\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 135,
1 year ago
"outputs": [
{
"data": {
1 year ago
"text/plain": " Publication Type Authors \n0 J Salucci, M; Arrebola, M; Shan, T; Li, MK \\\n1 J Huang, Y; Fu, ZT; Franzke, CLE \n2 J Feng, DC; Cetiner, B; Kakavand, MRA; Taciroglu, E \n3 J Zhao, YL; Dong, S; Jiang, FY; Soares, CG \n4 J Li, XH; Yang, DK; Yang, JS; Zheng, G; Han, GQ;... \n\n Book Authors Book Editors Book Group Authors \n0 NaN NaN NaN \\\n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n\n Author Full Names Book Author Full Names \n0 Salucci, Marco; Arrebola, Manuel; Shan, Tao; L... NaN \\\n1 Huang, Yu; Fu, Zuntao; Franzke, Christian L. E. NaN \n2 Feng, De-Cheng; Cetiner, Barbaros; Kakavand, M... NaN \n3 Zhao, Yuliang; Dong, Sheng; Jiang, Fengyuan; G... NaN \n4 Li, Xiaohui; Yang, Dongkai; Yang, Jingsong; Zh... NaN \n\n Group Authors Article Title \n0 NaN Artificial Intelligence: New Frontiers in Real... \\\n1 NaN Detecting causality from time series in a mach... \n2 NaN Data-Driven Approach to Predict the Plastic Hi... \n3 NaN System Reliability Analysis of an Offshore Jac... \n4 NaN Analysis of coastal wind speed retrieval from ... \n\n Source Title ... X_x Y_x \n0 IEEE TRANSACTIONS ON ANTENNAS AND PROPAGATION ... 42.244614 8.952363 \\\n1 CHAOS ... 17.704300 -22.741098 \n2 JOURNAL OF STRUCTURAL ENGINEERING ... -23.244829 17.004990 \n3 JOURNAL OF OCEAN UNIVERSITY OF CHINA ... -17.139648 14.667156 \n4 REMOTE SENSING OF ENVIRONMENT ... 68.567207 3.378003 \n\n X_y Y_y keyword_all \n0 42.244614 8.952363 IMAGING; THREE-DIMENSIONAL DISPLAYS; ELECTROMA... \\\n1 17.704300 -22.741098 STATE-SPACE RECONSTRUCTION; SURFACE AIR-TEMPER... \n2 -23.244829 17.004990 PLASTIC HINGE LENGTH; RC COLUMNS; MACHINE LEAR... \n3 -17.139648 14.667156 SYSTEM RELIABILITY; JACKET PLATFORM; BETA-UNZI... \n4 68.567207 3.378003 CYCLONE GNSS ; SEA SURFACE WIND SPEED; COASTAL... \n\n Document \n0 Artificial Intelligence: New Frontiers in Real... \\\n1 Detecting causality from time series in a mach... \n2 Data-Driven Approach to Predict the Plastic Hi... \n3 System Reliability Analysis of an Offshore Jac... \n4 Analysis of coastal wind speed retrieval from ... \n\n vector vector_norm TNSE-X \n0 [-1.8670139, -1.6925758, 0.48349068, -0.063790... 26.425585 35.139622 \\\n1 [-1.7312453, -0.4499114, -0.54250187, 0.690360... 28.921623 8.226096 \n2 [-2.3378334, -0.424522, -0.82274777, 1.622667,... 30.141471 -25.253866 \n3 [-2.4689128, -0.5432684, -0.429855, 0.6932005,... 30.455641 -18.432035 \n4 [-2.2039628, -0.79613304, -0.021788992, 0.7467... 26.722992 63.945808 \n\n TNSE-Y \n0 -19.611807 \n1 -14.699897 \n2 18.617361 \n3 17.831568 \n4 -21.907467 \n\n[5 rows x 91 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Publication Type</th>\n <th>Authors</th>\n <th>Book Authors</th>\n <th>Book Editors</th>\n <th>Book Group Authors</th>\n <th>Author Full Names</th>\n <th>Book Author Full Names</th>\n <th>Group Authors</th>\n <th>Article Title</th>\n <th>Source Title</th>\n <th>...</th>\n <th>X_x</th>\n <th>Y_x</th>\n <th>X_y</th>\n <th>Y_y</th>\n <th>keyword_all</th>\n <th>Document</th>\n <th>vector</th>\n <th>vector_norm</th>\n <th>TNSE-X</th>\n <th>TNSE-Y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>J</td>\n <td>Salucci, M; Arrebola, M; Shan, T; Li, MK</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Salucci, Marco; Arrebola, Manuel; Shan, Tao; L...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Artificial Intelligence: New Frontiers in Real...</td>\n <td>IEEE TRANSACTIONS ON ANTENNAS AND PROPAGATION</td>\n <td>...</td>\n <td>42.244614</td>\n <td>8.952363</td>\n <td>42.244614</td>\n <td>8.952363</td>\n <td>IMAGING; THREE-DIMENSIONAL DISPLAYS; ELECTROMA...</td>\n <td>Artificial Intelligence: New Frontiers in Real...</td>\n <td>[-1.8670139, -1.6925758, 0.48349068, -0.063790...</td>\n <td>26.425585</td>\n <td>35.139622</td>\n <td>-19.611807</td>\n </tr>\n <tr>\n <th>1</th>\n <td>J</td>\n <td>Huang, Y; Fu, ZT; Franzke, CLE</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Huang, Yu; Fu, Zuntao; Franzke, Christian L. E.</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Detecting causality from time series in a mach...</td>\n <td>CHAOS</td>\n <td>...</td>\n <td>17.704300</td>\n <td>-22.741098</td>\n <td>17.704300</td>\n <td>-22.741098</td>\n <td>STATE-SPACE RECONSTRUCTION; SURFACE AIR-TEMPER...</td>\n <td>Detecting causality from time series in a mach...</td>\n <td>[-1.7312453, -0.4499114, -0.54250187, 0.690360...</td>\n <td>28.921623</td>\n <td>8.226096</td>\n <td>-14.699897</td>\n </tr>\n <tr>\n <th>2</th>\n <td>J</td>\n <td>Feng, DC; Cetiner, B; Kakavand, MRA; Taciroglu, E</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Feng, De-Cheng; Cetiner, Barbaros; Kakavand, M...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Data-Driven Approach to Predict the Plastic Hi...</td>\n <td>JOURNAL OF STRUCTURAL ENGINEERING</td>\n <td>...</td>\n <td>-23.244829</td>\n <td>17.004990</td>\n <td>-23.244829</td>\n <td>17.004990</td>\n <td>PLASTIC HINGE LENGTH; RC COLUMNS; MACHINE LEAR...</td>\n <td>Data-Driven Approach to Predict the Plastic Hi...</td>\n <td>[-2.3378334, -0.424522, -0.82274777, 1.622667,...</td>\n <td>30.141471</td>\n <td>-25.253866</td>\n <td>18.617361</td>\n </tr>\n <tr>\n <th>3</th>\n <td>J</td>\n <td>Zhao, YL; Dong, S; Jiang, FY; Soares, CG</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Zhao, Yuliang; Dong, Sheng; Jiang, Fengyuan; G...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>System Reliability Analysis of an Offshore Jac...</td>\n <td>JOURNAL OF OCEAN UNIVERSITY OF CHINA</td>\n <td>...</td>\n <td>-17.139648</td>\n <td>14.667156</td>\n <td>-17.139648</td>\n <td>14.667156</td>\n <td>SYSTEM RELIABILITY; JACKET PLATFORM; BETA-UNZI...</td>\n <td>System Reliability Analysis of an Offshore Jac...</td>\n <td>[-2.4689128, -0.5432684, -0.429855, 0.6932005,...</td>\n <td>30.455641</td>\n <td>
1 year ago
},
1 year ago
"execution_count": 135,
1 year ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
1 year ago
"wos_plot.head()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 136,
"outputs": [],
"source": [
"wos_nlp=wos_plot[[record_col,\"Document\",\"keyword_all\",\"TNSE-X\",\"TNSE-Y\"]]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 123,
"outputs": [
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAj4AAAGwCAYAAACpYG+ZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3QU1dvA8e9s303vhTRICD30DtIFBQREKSJFBUFBRH4qL6AggmAXsStdVEBBRECU3qWHFlpCSAgQ0tsm2+f9Y2ElJEBQFJH7OWcP7MydO3dmN7vP3irJsiwjCIIgCIJwD1Dc6QIIgiAIgiD8U0TgIwiCIAjCPUMEPoIgCIIg3DNE4CMIgiAIwj1DBD6CIAiCINwzROAjCIIgCMI9QwQ+giAIgiDcM1R3ugD/Jg6HgwsXLuDh4YEkSXe6OIIgCIIgVIAsyxQWFhIaGopCceM6HRH4XOXChQuEh4ff6WIIgiAIgvAnnDt3jrCwsBumEYHPVTw8PADnjfP09LzDpREEQRAEoSIKCgoIDw93fY/fiAh8rnKlecvT01MEPoIgCIJwl6lINxXRuVkQBEEQhHuGCHwEQRAEQbhniMBHEARBEIR7hujjIwjCPcNut2O1Wu90MQRB+BM0Gs1Nh6pXhAh8BEH4z5NlmfT0dPLy8u50UQRB+JMUCgWVK1dGo9H8pXxE4CMIwn/elaAnMDAQg8EgJigVhLvMlQmGL168SERExF/6GxaBjyAI/2l2u90V9Pj5+d3p4giC8CcFBARw4cIFbDYbarX6T+cjOjcLgvCfdqVPj8FguMMlEQThr7jSxGW32/9SPiLwEQThniCatwTh7na7/oZF4CMIgiAIwj1DBD6CIAiCINwzROAjCIIg/KvMnz8fb2/vO12MW3ZtuV977TXq1atXoWNvJa3w14jARxAE4R4zZMgQJElCkiTUajVBQUF06tSJuXPn4nA47nTx6Nu3L6dOnbpt+V19vVc/unTpctvOUZ4XX3yRDRs2/K3nEG6dGM4uCIJwD+rSpQvz5s3Dbrdz6dIl1q5dy/PPP88PP/zAypUrUanu3NeDXq9Hr9ff1jyvXO/VtFrtbT3Htdzd3XF3d/9bzyHcOlHjIwiCcA/SarUEBwdTqVIlGjRowIQJE/jpp5/45ZdfmD9/PgCpqan06NEDd3d3PD096dOnD5cuXXLlcaV5Zu7cuURERODu7s6zzz6L3W7n7bffJjg4mMDAQN54441S537//fepU6cObm5uhIeH8+yzz1JUVOTaf70mo6+//pqoqCi8vLzo168fhYWFt3y9Vz98fHxc+yVJYvbs2fTq1QuDwUDVqlVZuXJlqTxWrlxJ1apV0el0tGvXjgULFiBJ0nVnBL+2+Wrz5s00adIENzc3vL29admyJSkpKaWO+SvXKFSMCHwEQRAEANq3b0/dunVZvnw5DoeDHj16kJOTw5YtW1i3bh1nzpyhb9++pY5JSkril19+Ye3atXz33XfMmTOHrl27kpaWxpYtW3jrrbd45ZVX2L17t+sYhULBrFmzOHbsGAsWLGDjxo28/PLLNyxbUlISK1asYNWqVaxatYotW7bw5ptv3tbrnzJlCn369OHw4cM8+OCDDBgwgJycHACSk5N55JFH6NmzJ4cOHWL48OFMnDixwnnbbDZ69uxJmzZtOHz4MLt27eLpp58uNUT7n7hGAZAFl/z8fBmQ8/Pz73RRBEG4TUpKSuSEhAS5pKTkThflX2Pw4MFyjx49yt3Xt29fuUaNGvJvv/0mK5VKOTU11bXv2LFjMiDv2bNHlmVZnjx5smwwGOSCggJXms6dO8tRUVGy3W53batWrZo8Y8aM65bn+++/l/38/FzP582bJ3t5ebmel3eel156SW7atGmFr1epVMpubm6lHm+88YYrDSC/8sorrudFRUUyIP/yyy+yLMvyuHHj5Nq1a5fKd+LEiTIg5+bmXrfcdevWlWVZlrOzs2VA3rx5c7ll/KvXeC+40d/yrXx/iz4+giAIgossy0iSxPHjxwkPDyc8PNy1r2bNmnh7e3P8+HEaN24MQFRUFB4eHq40QUFBKJXKUqtoBwUFkZGR4Xq+fv16ZsyYwYkTJygoKMBms2EymSguLr7uDNvXnickJKRUnjfTrl07Pvvss1LbfH19Sz2Pi4tz/d/NzQ1PT0/XOU6ePOm65iuaNGlS4fP7+voyZMgQOnfuTKdOnejYsSN9+vQhJCTEleavXqNQMaKpSxAEQXA5fvw4lStXrnD6a9dMujJS7NptV0aLnT17lm7duhEXF8eyZcvYv38/n3zyCQAWi+WWznMrI9Dc3NyIiYkp9bg28Pmr57iZefPmsWvXLlq0aMGSJUuIjY3l999//8fOLziJwEcQBEEAYOPGjRw5coTevXtTo0YNzp07x7lz51z7ExISyMvLo2bNmn/6HPv378fhcPDee+/RrFkzYmNjuXDhwu0o/t+qWrVq7Nu3r9S2vXv33nI+9evXZ/z48ezcuZPatWvz7bff3q4iChUkmroEQRDuQWazmfT09FLD2WfMmEG3bt0YNGgQCoWCOnXqMGDAAGbOnInNZuPZZ5+lTZs2NGrU6E+fNyYmBqvVykcffUT37t3ZsWMHn3/++W28svJdud6rqVQq/P39K3T88OHDef/99xk3bhxPPfUU8fHxrtFvFVlDKjk5mS+//JKHHnqI0NBQTp48yenTpxk0aNAtX4vw14gaH0EQhHvQ2rVrCQkJISoqii5durBp0yZmzZrFTz/9hFKpRJIkfvrpJ3x8fLjvvvvo2LEjVapUYcmSJX/pvHXr1uX999/nrbfeonbt2nzzzTfMmDHjNl3V9V253qsfrVq1qvDxlStX5ocffmD58uXExcXx2WefuUZ1VWQ+IIPBwIkTJ+jduzexsbE8/fTTjBw5kuHDh//paxL+HEmWZflOF+LfoqCgAC8vL/Lz8/H09LzTxREE4TYwmUwkJydTuXJldDrdnS6O8B/yxhtv8Pnnn5dqDhT+Pjf6W76V72/R1CUIgiAIFfDpp5/SuHFj/Pz82LFjB++88w6jRo2608USbpEIfARBEIS7Vmpq6g07WyckJBAREXFbznX69GmmTZtGTk4OERER/O9//2P8+PG3JW/hn/Ov6eOzdetWunfvTmhoKJIksWLFilL7ZVlm0qRJhISEoNfr6dixI6dPny6VJicnhwEDBuDp6Ym3tzdPPfVUqWnQBUEQhP+W0NBQ4uPjr/sIDQ29bef64IMPuHDhAiaTiVOnTvHqq6/e0TXNhD/nXxP4GI1G6tat65rP4Vpvv/02s2bN4vPPP2f37t24ubnRuXNnTCaTK82AAQM4duwY69atY9WqVWzdupWnn376n7oEQRAE4R+mUqnKzM9z9UMEJsK1/jXviAceeIAHHnig3H2yLDNz5kxeeeUVevToAcDChQsJCgpixYoV9OvXj+PHj7N27Vr27t3rGmr50Ucf8eCDD/Luu+/e1qhfEARBEIS707+mxudGkpOTSU9Pp2PHjq5tXl5eNG3alF27dgGwa9cuvL29S80v0bFjRxQKRanF8a5mNpspKCgo9RAEQRAE4b/rrgh8rkw6FRQUVGp7UFCQa196ejqBgYGl9qtUKnx9fctMWnXFjBkz8PLycj2uXpNGEARBEIT/nrsi8Pm7jB8/nvz8fNdDzMUgCIIgCP9td0XgExwcDMClS5dKbb906ZJrX3BwcJlVbG02Gzk5Oa4019JqtXh6epZ6CIIgCILw33VXBD6VK1cmODiYDRs2uLYVFBSwe/dumjdvDkDz5s3Jy8tj//79rjQbN27E4XDQtGnTf7zMgiAId5PXXnuNevXquZ4PGTKEnj17/qU8N2/ejCRJ5OXl/aV8buR2lFO4t/xrAp+ioiLXvAvg7NAcHx9PamoqkiQxZswYpk2bxsqVKzly5AiDBg0iNDTU9YavUaMGXbp0YdiwYezZs4cdO3YwatQo+vXrJ0Z0CYJwV9u1axdKpZKuXbv+Y+f88MMPXYtw/p0OHTrEQw89RGBgIDqdjqioKPr27VumBv96/qlyCv8d/5rh7Pv27aNdu3au52P
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"g = sns.kdeplot(\n",
" data=wos_plot[wos_plot[\"Domain_English\"]!='article-level classification'],\n",
" x=\"TNSE-X\", y=\"TNSE-Y\", hue='Domain_English',\n",
" thresh=.1,\n",
")"
1 year ago
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
1 year ago
"execution_count": 110,
1 year ago
"outputs": [
{
"data": {
1 year ago
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'Web of Science Record', 'issn_var', 'issn',\n 'Domain_English', 'Field_English', 'SubField_English', '2.00 SEQ',\n 'Source_title', 'srcid', 'issn_type', 'X_x', 'Y_x', 'X_y', 'Y_y',\n 'TNSE-X_x', 'TNSE-Y_x', 'TNSE-X_y', 'TNSE-Y_y', 'TNSE-X', 'TNSE-Y'],\n dtype='object')"
1 year ago
},
1 year ago
"execution_count": 110,
1 year ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
1 year ago
"wos.columns"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 105,
"outputs": [
{
"ename": "KeyError",
"evalue": "'TNSE-X'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3649\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3648\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 3649\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3650\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_libs\\index.pyx:147\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_libs\\index.pyx:176\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
"File \u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi:7080\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
"File \u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi:7088\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
"\u001B[1;31mKeyError\u001B[0m: 'TNSE-X'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[105], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mwos\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mTNSE-X\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\core\\frame.py:3745\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3743\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m 3744\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[1;32m-> 3745\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3746\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[0;32m 3747\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3651\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3649\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_engine\u001B[38;5;241m.\u001B[39mget_loc(casted_key)\n\u001B[0;32m 3650\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n\u001B[1;32m-> 3651\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[0;32m 3652\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[0;32m 3653\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[0;32m 3654\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[0;32m 3655\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[0;32m 3656\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n",
"\u001B[1;31mKeyError\u001B[0m: 'TNSE-X'"
]
}
],
"source": [
"wos[\"TNSE-X\"]"
],
1 year ago
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
2 years ago
{
"cell_type": "code",
2 years ago
"execution_count": 8,
"metadata": {},
2 years ago
"outputs": [],
"source": [
2 years ago
"geotext = GeoText()\n",
"\n",
"def extract_location(input_text, key='countries'):\n",
" anomalies = {\"Malta\":\"Malta\",\n",
" \"Mongolia\":\"Mongolia\",\n",
" \"Quatar\":\"Qatar\",\n",
" \"Qatar\":\"Qatar\",\n",
" \"Ethiop\":\"Ethiopia\",\n",
" \"Nigeria\":\"Nigeria\",\n",
" \"BELAR\":\"Belarus\",\n",
" \"Venezuela\":\"Venezuela\",\n",
" \"Cyprus\":\"Cyprus\",\n",
" \"Ecuador\":\"Ecuador\",\n",
" \"U Arab\":\"United Arab Emirates\",\n",
" \"Syria\":\"Syria\",\n",
" \"Uganda\":\"Uganda\",\n",
" \"Yemen\":\"Yemen\",\n",
" \"Mali\":\"Mali\",\n",
" \"Senegal\":\"Senegal\",\n",
" \"Vatican\":\"Vatican\",\n",
" \"Uruguay\":\"Uruguay\",\n",
" \"Panama\":\"Panama\",\n",
" \"Fiji\":\"Fiji\",\n",
" \"Faroe\":\"Faroe Islands\",\n",
" \"Macedonia\":\"Macedonia\",\n",
" 'Mozambique':'Mozambique',\n",
" \"Kuwait\":\"Kuwait\",\n",
" \"Libya\":\"Libya\",\n",
" \"Turkiy\":\"Turkey\",\n",
" \"Liberia\":\"Liberia\",\n",
" \"Namibia\":\"Namibia\",\n",
" \"Ivoire\":\"Ivory Coast\",\n",
" \"Guatemala\":\"Gutemala\",\n",
" \"Paraguay\":\"Paraguay\",\n",
" \"Honduras\":\"Honduras\",\n",
" \"Nicaragua\":\"Nicaragua\",\n",
" \"Trinidad\":\"Trinidad & Tobago\",\n",
" \"Liechtenstein\":\"Liechtenstein\",\n",
" \"Greenland\":\"Denmark\"}\n",
"\n",
" extracted = geotext.extract(input_text=input_text)\n",
" found = extracted[key].keys()\n",
" if len(sorted(found))>0:\n",
" return sorted(found)[0]\n",
" elif key=='countries':\n",
" for i in ['Scotland','Wales','England']:\n",
" if i in input_text:\n",
" return 'United Kingdom'\n",
" for j in anomalies.keys():\n",
" if j in input_text:\n",
" return anomalies.get(j)\n",
" else:\n",
" return None\n",
"\n",
"with open('../eu_members.txt',\"r\") as f:\n",
" eu_countries=f.readline().split(\",\")\n",
" eu_countries=[i.strip() for i in eu_countries]\n",
"\n",
"def country_type(country):\n",
" if country in eu_countries:\n",
" return \"EU\"\n",
" elif country==\"China\":\n",
" return \"China\"\n",
" else:\n",
" return \"Other\"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])\n",
"locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
"locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
"locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
]
2 years ago
},
{
"cell_type": "code",
2 years ago
"execution_count": 10,
"metadata": {},
2 years ago
"outputs": [
{
2 years ago
"data": {
"text/plain": " UT (Unique WOS ID) Address \n1 WOS:000209536100003 BGI HK Ltd, GigaSci, Tai Po, Hong Kong, Peopl... \\\n2 WOS:000209536100003 Nat Hist Museum, London SW7 5BD, England; \n3 WOS:000209536100003 Pensoft Publishers, Sofia, Bulgaria; \n4 WOS:000209536100003 Nat Hist Museum, Natl Museum, Sofia, Bulgaria; \n5 WOS:000209536100003 Bulgarian Acad Sci, Inst Biodivers & Ecosyst ... \n\n Country City Country_Type Institution \n1 China Hong Kong China BGI HK Ltd \n2 United Kingdom London Other Nat Hist Museum \n3 Bulgaria Sofia EU Pensoft Publishers \n4 Bulgaria Sofia EU Nat Hist Museum \n5 Bulgaria Rees EU Bulgarian Acad Sci ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>BGI HK Ltd, GigaSci, Tai Po, Hong Kong, Peopl...</td>\n <td>China</td>\n <td>Hong Kong</td>\n <td>China</td>\n <td>BGI HK Ltd</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>Nat Hist Museum, London SW7 5BD, England;</td>\n <td>United Kingdom</td>\n <td>London</td>\n <td>Other</td>\n <td>Nat Hist Museum</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>Pensoft Publishers, Sofia, Bulgaria;</td>\n <td>Bulgaria</td>\n <td>Sofia</td>\n <td>EU</td>\n <td>Pensoft Publishers</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>Nat Hist Museum, Natl Museum, Sofia, Bulgaria;</td>\n <td>Bulgaria</td>\n <td>Sofia</td>\n <td>EU</td>\n <td>Nat Hist Museum</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209536100003</td>\n <td>Bulgarian Acad Sci, Inst Biodivers &amp; Ecosyst ...</td>\n <td>Bulgaria</td>\n <td>Rees</td>\n <td>EU</td>\n <td>Bulgarian Acad Sci</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
2 years ago
}
],
"source": [
2 years ago
"univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
"univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
"univ_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Country\nChina 21063\nUnited States 5913\nGermany 4179\nItaly 3195\nFrance 2767\n ... \nFaroe Islands 1\nHonduras 1\nVatican 1\nMacedonia 1\nJamaica 1\nName: count, Length: 137, dtype: int64"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
2 years ago
],
2 years ago
"source": [
"locations[\"Country\"].value_counts()"
]
2 years ago
},
{
"cell_type": "code",
2 years ago
"execution_count": 12,
"metadata": {},
2 years ago
"outputs": [
{
"data": {
2 years ago
"text/plain": "Country_Type\nEU 21228\nChina 21063\nOther 20404\nName: count, dtype: int64"
2 years ago
},
2 years ago
"execution_count": 12,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2 years ago
"locations[\"Country_Type\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type Author_name \n0 WOS:000209536100003 Bulgaria EU Stoev, Pavel \\\n1 WOS:000209536100003 Bulgaria EU Penev, Lyubomir \n2 WOS:000209536100003 Bulgaria EU Stoev, Pavel \n3 WOS:000209536100003 Bulgaria EU Penev, Lyubomir \n4 WOS:000209536100003 China China Edmunds, Scott C. \n... ... ... ... ... \n173441 WOS:000947693400001 China China Peng, Sihua \n173442 WOS:000947693400001 China China Shen, Zhehan \n173443 WOS:000947693400001 China China Shen, Zhehan \n173444 WOS:000947693400001 China China Liu, Taigang \n173445 WOS:000947693400001 Spain EU Jiang, Linhua \n\n author_str_id \n0 stoevpavel \n1 penevlyubomir \n2 stoevpavel \n3 penevlyubomir \n4 edmundsscottc \n... ... \n173441 pengsihua \n173442 shenzhehan \n173443 shenzhehan \n173444 liutaigang \n173445 jianglinhua \n\n[173446 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>Author_name</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000209536100003</td>\n <td>Bulgaria</td>\n <td>EU</td>\n <td>Stoev, Pavel</td>\n <td>stoevpavel</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>Bulgaria</td>\n <td>EU</td>\n <td>Penev, Lyubomir</td>\n <td>penevlyubomir</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>Bulgaria</td>\n <td>EU</td>\n <td>Stoev, Pavel</td>\n <td>stoevpavel</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>Bulgaria</td>\n <td>EU</td>\n <td>Penev, Lyubomir</td>\n <td>penevlyubomir</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>China</td>\n <td>China</td>\n <td>Edmunds, Scott C.</td>\n <td>edmundsscottc</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>173441</th>\n <td>WOS:000947693400001</td>\n <td>China</td>\n <td>China</td>\n <td>Peng, Sihua</td>\n <td>pengsihua</td>\n </tr>\n <tr>\n <th>173442</th>\n <td>WOS:000947693400001</td>\n <td>China</td>\n <td>China</td>\n <td>Shen, Zhehan</td>\n <td>shenzhehan</td>\n </tr>\n <tr>\n <th>173443</th>\n <td>WOS:000947693400001</td>\n <td>China</td>\n <td>China</td>\n <td>Shen, Zhehan</td>\n <td>shenzhehan</td>\n </tr>\n <tr>\n <th>173444</th>\n <td>WOS:000947693400001</td>\n <td>China</td>\n <td>China</td>\n <td>Liu, Taigang</td>\n <td>liutaigang</td>\n </tr>\n <tr>\n <th>173445</th>\n <td>WOS:000947693400001</td>\n <td>Spain</td>\n <td>EU</td>\n <td>Jiang, Linhua</td>\n <td>jianglinhua</td>\n </tr>\n </tbody>\n</table>\n<p>173446 rows × 5 columns</p>\n</div>"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
2 years ago
],
2 years ago
"source": [
"author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
"author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
"author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
"author_locations"
]
2 years ago
},
{
"cell_type": "code",
2 years ago
"execution_count": 14,
"metadata": {},
2 years ago
"outputs": [
{
2 years ago
"data": {
"text/plain": "8925"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
2 years ago
}
],
"source": [
2 years ago
"author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
"# author_primary_region\n",
"\n",
"china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
"eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
"\n",
"len(wos[((wos[record_col].isin(china))\n",
" &\n",
" (wos[record_col].isin(eu)))])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "9889"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
2 years ago
],
2 years ago
"source": [
"len(wos)"
]
2 years ago
},
{
"cell_type": "code",
2 years ago
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"# affiliations[affiliations[\"Affiliations\"].str.lower().str.contains(\"chinese academy\", na=False, regex=True)][\"Affiliations\"].value_counts()\n",
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
"affiliations = affiliations.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
2 years ago
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2 years ago
"76485 72581\n"
2 years ago
]
2 years ago
}
],
"source": [
"aff_ = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"loc_ = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"print(len(aff_),len(loc_))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
2 years ago
{
2 years ago
"data": {
"text/plain": "[['IDAHO'],\n ['ICREA'],\n ['CEA'],\n ['AGROPARISTECH'],\n ['LENOVO'],\n ['RIKEN'],\n ['MICROSOFT'],\n ['GLAXOSMITHKLINE'],\n ['UNICANCER'],\n ['INRIA'],\n ['CIBERESP'],\n ['SINOPEC'],\n ['PHILIPS'],\n ['CIRAD'],\n ['VITO'],\n ['IMEC'],\n ['ILLUMINA'],\n ['EURECOM'],\n ['BAIDU'],\n ['CIBEREHD'],\n ['UNKNOWN'],\n ['BAYCREST'],\n ['NOVARTIS'],\n ['ITER'],\n ['PELIN'],\n ['INRAE'],\n ['ASTRAZENECA'],\n ['ERICSSON'],\n ['IDIBAPS'],\n ['CGIAR'],\n ['UNILEVER'],\n ['GENENTECH'],\n ['TENCENT'],\n ['NICTA'],\n ['QUALCOMM'],\n ['INESC-ID'],\n ['CIBERES'],\n ['ALCATEL-LUCENT'],\n ['TEAGASC'],\n ['ABB'],\n ['HEWLETT-PACKARD'],\n ['AT&T'],\n ['RIGSHOSPITALET'],\n ['FORTISS'],\n ['AMAZON.COM'],\n ['BASF'],\n ['BOSCH'],\n ['CIBERSAM'],\n ['EURATOM'],\n ['UNINETTUNO'],\n ['E-ON'],\n ['DELPHI'],\n ['BIOGEN'],\n ['SAMSUNG'],\n ['INTERDIGITAL'],\n ['SYNGENTA'],\n ['CIBERONC'],\n ['IRTA'],\n ['MICA'],\n ['MEDTRONIC'],\n ['IFREMER'],\n ['DELTARES'],\n ['PROFIL'],\n ['SANOFI-AVENTIS'],\n ['REGENERON'],\n ['YUTONG'],\n ['CIBERBBN'],\n ['KAKAO'],\n ['DNV'],\n ['SCHLUMBERGER'],\n ['ITALFARMACO'],\n ['CYBERNETICA'],\n ['ZTE'],\n ['NAVER'],\n ['VOLVO'],\n ['CHANGHONG'],\n ['CINTECX'],\n ['VINUNIVERSITY'],\n ['SERVIER'],\n ['CIBERCV'],\n ['IMELDAZIEKENHUIS'],\n ['DIAKONESSENHUIS'],\n ['ADVENTHEALTH'],\n ['ALLIANCE'],\n ['AUDENCIA'],\n ['SINTEF'],\n ['SAP'],\n ['ELEKTA'],\n ['ELSEVIER'],\n ['CIBEROBN'],\n ['PFIZER'],\n ['ABBVIE'],\n ['NAVARRABIOMED'],\n ['BYD'],\n ['INSPUR'],\n ['CIBERNED'],\n ['SHANDONG', 'UNIVERSITY'],\n ['HEBEI', 'UNIVERSITY'],\n ['BOGAZICI', 'UNIVERSITY'],\n ['DOGUS', 'UNIVERSITY'],\n ['GAZIANTEP', 'UNIVERSITY'],\n ['ANKARA', 'UNIVERSITY'],\n ['DUMLUPINAR', 'UNIVERSITY'],\n ['GAZI', 'UNIVERSITY'],\n ['BOSTON', 'UNIVERSITY'],\n ['BRANDEIS', 'UNIVERSITY'],\n ['CARLETON', 'UNIVERSITY'],\n ['NANJING', 'UNIVERSITY'],\n ['COLUMBIA', 'UNIVERSITY'],\n ['HELMHOLTZ', 'ASSOCIATION'],\n ['DUKE', 'UNIVERSITY'],\n ['HAMPTON', 'UNIVERSITY'],\n ['HARVARD', 'UNIVERSITY'],\n ['KOBE', 'UNIVERSITY'],\n ['KYOTO', 'UNIVERSITY'],\n ['LANCASTER', 'UNIVERSITY'],\n ['SORBONNE', 'UNIVERSITE'],\n ['LUND', 'UNIVERSITY'],\n ['AIX-MARSEILLE', 'UNIVERSITE'],\n ['MCGILL', 'UNIVERSITY'],\n ['NAGOYA', 'UNIVERSITY'],\n ['OKAYAMA', 'UNIVERSITY'],\n ['OSAKA', 'UNIVERSITY'],\n ['RITSUMEIKAN', 'UNIVERSITY'],\n ['SHINSHU', 'UNIVERSITY'],\n ['UNIVERSITAT', 'SIEGEN'],\n ['STANFORD', 'UNIVERSITY'],\n ['STOCKHOLM', 'UNIVERSITY'],\n ['TUFTS', 'UNIVERSITY'],\n ['UPPSALA', 'UNIVERSITY'],\n ['WASEDA', 'UNIVERSITY'],\n ['YALE', 'UNIVERSITY'],\n ['HIROSHIMA', 'UNIVERSITY'],\n ['MANHATTAN', 'COLLEGE'],\n ['JAGIELLONIAN', 'UNIVERSITY'],\n ['FUDAN', 'UNIVERSITY'],\n ['YANTAI', 'UNIVERSITY'],\n ['UNIVERSITY', 'OSNABRUCK'],\n ['PEKING', 'UNIVERSITY'],\n ['TSINGHUA', 'UNIVERSITY'],\n ['SYRACUSE', 'UNIVERSITY'],\n ['ZHEJIANG', 'UNIVERSITY'],\n ['MCMASTER', 'UNIVERSITY'],\n ['ETH', 'ZURICH'],\n ['TUSCIA', 'UNIVERSITY'],\n ['LISHUI', 'UNIVERSITY'],\n ['LEGEND', 'HOLDINGS'],\n ['WUHAN', 'UNIVERSITY'],\n ['GHENT', 'UNIVERSITY'],\n ['SHANGHAI', 'UNIVERSITY'],\n ['JILIN', 'UNIVERSITY'],\n ['ULSTER', 'UNIVERSITY'],\n ['JIANGNAN', 'UNIVERSITY'],\n ['KU', 'LEUVEN'],\n ['HOCHSCHULE', 'AALEN'],\n ['SHAOYANG', 'UNIVERSITY'],\n ['HUNAN', 'UNIVERSITY'],\n ['KYUSHU', 'UNIVERSITY'],\n ['TONGJI', 'UNIVERSITY'],\n ['TAMPERE', 'UNIVERSITY'],\n ['AALTO', 'UNIVERSITY'],\n ['OBUDA', 'UNIVERSITY'],\n ['PANJAB', 'UNIVERSITY'],\n ['KOREA', 'UNIVERSITY'],\n ['VILNIUS', 'UNIVERSITY'],\n ['CHULALONGKORN', 'UNIVERSITY'],\n ['CUKUROVA', 'UNIVERSITY'],\n ['BRUNEL', 'UNIVERSITY'],\n ['BAYLOR', 'UNIVERSITY'],\n ['BROWN', 'UNIVERSITY'],\n ['CORNELL', 'UNIVERSITY'],\n ['FAIRFIELD', 'UNIVERSITY'],\n ['NORTHEASTERN', 'UNIVERSITY'],\n ['NORTHWESTERN', 'UNIVERSITY'],\n ['PRINCETON', 'UNIVERSITY'],\n ['PURDUE', 'UNIVERSITY'],\n ['RICE', 'UNIVERSITY'],\n ['ROCKEFELLER', 'UNIVERSITY'],\n ['VANDERBILT', 'UNIVERSITY'],\n ['CAIRO', 'UNIVERSITY'],\n ['FAYOUM', 'UNIVERSITY'],\n ['HELWAN', 'UNIVERSITY'],\n ['SHIRAZ', 'UNIVERSITY'],\n ['GAZIOSMANPASA', 'UNIVERSITY'],\n ['ADIYAMAN', 'UNIVERSITY'],\n ['MERSIN', '
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unique_inst = sorted([i.split(\" \") for i in list(affiliations[\"Affiliations\"].unique())], key=len)\n",
"# unique_inst = [[''.join(filter(str.isalnum, i)) for i in i_list] for i_list in unique_inst]\n",
"unique_inst = [[i.strip(\",\").strip(\"(\").strip(\")\") for i in i_list] for i_list in unique_inst]\n",
"unique_inst"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def institution_chunk_norris(text):\n",
" for i in unique_inst:\n",
" text_split=text.split(\" \")\n",
" text_split=[i.strip(\",\").strip(\"(\").strip(\")\") for i in text_split]\n",
" overlap = all(token in text_split for token in i)\n",
" if overlap:\n",
" return (\" \".join(i))\n",
" return \"ERROR\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"affiliations[\"Affiliations_merged\"] = affiliations[\"Affiliations\"].apply(lambda x: institution_chunk_norris(x))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 1188\nUDICE-FRENCH RESEARCH UNIVERSITIES 647\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 640\nHELMHOLTZ ASSOCIATION 427\nUNIVERSITY OF CHINESE ACADEMY OF SCIENCES, CAS 411\n ... \nIMT NORD EUROPE 1\nSANGMYUNG UNIVERSITY 1\nINDIANA UNIVERSITY PURDUE UNIVERSITY FORT WAYNE 1\nJAHANGIRNAGAR UNIVERSITY 1\nSAINT JAMES'S UNIVERSITY HOSPITAL 1\nName: count, Length: 4884, dtype: int64"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
2 years ago
}
],
"source": [
2 years ago
"affiliations[\"Affiliations\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Affiliations_merged\nCHINESE ACADEMY OF SCIENCES 1725\nNANJING UNIVERSITY 737\nSHANGHAI UNIVERSITY 667\nUDICE-FRENCH RESEARCH UNIVERSITIES 647\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS 640\n ... \nULVAC INC. 1\nNATIONAL METROLOGY INSTITUTE OF JAPAN 1\nSHEFFIELD HALLAM UNIVERSITY 1\nGLOBAL INSTITUTE FOR WATER SECURITY 1\nSAINT JAMES'S UNIVERSITY HOSPITAL 1\nName: count, Length: 4241, dtype: int64"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
2 years ago
],
2 years ago
"source": [
"affiliations[\"Affiliations_merged\"].value_counts()"
]
2 years ago
},
{
"cell_type": "code",
2 years ago
"execution_count": 23,
"metadata": {},
2 years ago
"outputs": [
{
2 years ago
"data": {
"text/plain": "Empty DataFrame\nColumns: [UT (Unique WOS ID), Affiliations, Affiliations_merged]\nIndex: []",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Affiliations</th>\n <th>Affiliations_merged</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>\n</div>"
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[affiliations[\"Affiliations_merged\"]==\"ERROR\"]"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"from nltk.metrics import edit_distance\n",
"from nltk.metrics import edit_distance_align\n",
"#results = df.apply(lambda x: edit_distance(x[\"column1\"], x[\"column2\"]), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"affiliations = affiliations.merge(univ_locations, on=record_col)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
2 years ago
{
"data": {
2 years ago
"text/plain": " UT (Unique WOS ID) Affiliations \n0 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \\\n1 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n2 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n3 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n4 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n\n Affiliations_merged \n0 NATURAL HISTORY MUSEUM LONDON \\\n1 NATURAL HISTORY MUSEUM LONDON \n2 NATURAL HISTORY MUSEUM LONDON \n3 NATURAL HISTORY MUSEUM LONDON \n4 NATURAL HISTORY MUSEUM LONDON \n\n Address Country \n0 BGI HK Ltd, GigaSci, Tai Po, Hong Kong, Peopl... China \\\n1 Nat Hist Museum, London SW7 5BD, England; United Kingdom \n2 Pensoft Publishers, Sofia, Bulgaria; Bulgaria \n3 Nat Hist Museum, Natl Museum, Sofia, Bulgaria; Bulgaria \n4 Bulgarian Acad Sci, Inst Biodivers & Ecosyst ... Bulgaria \n\n City Country_Type Institution levehnstein \n0 Hong Kong China BGI HK LTD 24 \n1 London Other NAT HIST MUSEUM 14 \n2 Sofia EU PENSOFT PUBLISHERS 25 \n3 Sofia EU NAT HIST MUSEUM 14 \n4 Rees EU BULGARIAN ACAD SCI 25 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Affiliations</th>\n <th>Affiliations_merged</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n <th>levehnstein</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>BGI HK Ltd, GigaSci, Tai Po, Hong Kong, Peopl...</td>\n <td>China</td>\n <td>Hong Kong</td>\n <td>China</td>\n <td>BGI HK LTD</td>\n <td>24</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Nat Hist Museum, London SW7 5BD, England;</td>\n <td>United Kingdom</td>\n <td>London</td>\n <td>Other</td>\n <td>NAT HIST MUSEUM</td>\n <td>14</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Pensoft Publishers, Sofia, Bulgaria;</td>\n <td>Bulgaria</td>\n <td>Sofia</td>\n <td>EU</td>\n <td>PENSOFT PUBLISHERS</td>\n <td>25</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Nat Hist Museum, Natl Museum, Sofia, Bulgaria;</td>\n <td>Bulgaria</td>\n <td>Sofia</td>\n <td>EU</td>\n <td>NAT HIST MUSEUM</td>\n <td>14</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Bulgarian Acad Sci, Inst Biodivers &amp; Ecosyst ...</td>\n <td>Bulgaria</td>\n <td>Rees</td>\n <td>EU</td>\n <td>BULGARIAN ACAD SCI</td>\n <td>25</td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
2 years ago
"execution_count": 26,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2 years ago
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.upper().str.strip()\n",
"affiliations[\"Institution\"] = affiliations[\"Institution\"].str.upper().str.strip()\n",
"\n",
"affiliations[\"levehnstein\"] = affiliations.apply(\n",
" lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n",
"affiliations.head()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"outputs": [],
"source": [
"def tok_overlap(lon_str, short_str):\n",
" l,s = lon_str.split(\" \"), short_str.split(\" \")\n",
" # create a pairwise distance matrix using NumPy\n",
" distance_matrix = np.fromfunction(np.vectorize(lambda i, j: edit_distance(l[int(i)], s[int(j)])), shape=(len(l), len(s)))\n",
" distance_frame = pd.DataFrame(data=distance_matrix, columns=s, index=l)\n",
"\n",
" return min(distance_frame.min().sum(),distance_frame.T.min().sum())\n",
"\n",
"# lon=(\"UNIVERSITY\",\"AMSTERDAM\",\"TECHNICAL\", \"LOCAL\")\n",
"# sho=(\"UNIV\",\"AMSTER\",\"TECH\",\"LOCAL\")\n",
"# tok_overlap(lon_str=\" \".join(lon),short_str=\" \".join(sho)).min().sum()"
2 years ago
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
2 years ago
"execution_count": 62,
2 years ago
"outputs": [
{
2 years ago
"data": {
"text/plain": "(4, 3)"
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
2 years ago
}
],
"source": [
2 years ago
"tok_overlap(lon_str=\" \".join(l),short_str=\" \".join(s)).shape"
2 years ago
],
"metadata": {
"collapsed": false
}
2 years ago
},
{
"cell_type": "code",
1 year ago
"execution_count": 72,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Affiliations \n0 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \\\n1 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n2 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n3 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n4 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n\n Affiliations_merged \n0 NATURAL HISTORY MUSEUM LONDON \\\n1 NATURAL HISTORY MUSEUM LONDON \n2 NATURAL HISTORY MUSEUM LONDON \n3 NATURAL HISTORY MUSEUM LONDON \n4 NATURAL HISTORY MUSEUM LONDON \n\n Address Country \n0 BGI HK Ltd, GigaSci, Tai Po, Hong Kong, Peopl... China \\\n1 Nat Hist Museum, London SW7 5BD, England; United Kingdom \n2 Pensoft Publishers, Sofia, Bulgaria; Bulgaria \n3 Nat Hist Museum, Natl Museum, Sofia, Bulgaria; Bulgaria \n4 Bulgarian Acad Sci, Inst Biodivers & Ecosyst ... Bulgaria \n\n City Country_Type Institution levehnstein token_overlap \n0 Hong Kong China BGI HK LTD 24 16 \n1 London Other NAT HIST MUSEUM 14 7 \n2 Sofia EU PENSOFT PUBLISHERS 25 12 \n3 Sofia EU NAT HIST MUSEUM 14 7 \n4 Rees EU BULGARIAN ACAD SCI 25 17 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Affiliations</th>\n <th>Affiliations_merged</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n <th>levehnstein</th>\n <th>token_overlap</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>BGI HK Ltd, GigaSci, Tai Po, Hong Kong, Peopl...</td>\n <td>China</td>\n <td>Hong Kong</td>\n <td>China</td>\n <td>BGI HK LTD</td>\n <td>24</td>\n <td>16</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Nat Hist Museum, London SW7 5BD, England;</td>\n <td>United Kingdom</td>\n <td>London</td>\n <td>Other</td>\n <td>NAT HIST MUSEUM</td>\n <td>14</td>\n <td>7</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Pensoft Publishers, Sofia, Bulgaria;</td>\n <td>Bulgaria</td>\n <td>Sofia</td>\n <td>EU</td>\n <td>PENSOFT PUBLISHERS</td>\n <td>25</td>\n <td>12</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Nat Hist Museum, Natl Museum, Sofia, Bulgaria;</td>\n <td>Bulgaria</td>\n <td>Sofia</td>\n <td>EU</td>\n <td>NAT HIST MUSEUM</td>\n <td>14</td>\n <td>7</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Bulgarian Acad Sci, Inst Biodivers &amp; Ecosyst ...</td>\n <td>Bulgaria</td>\n <td>Rees</td>\n <td>EU</td>\n <td>BULGARIAN ACAD SCI</td>\n <td>25</td>\n <td>17</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
2 years ago
"source": [
"affiliations[\"token_overlap\"] = affiliations.apply(\n",
" lambda x: tok_overlap(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n",
"affiliations.head()"
],
"metadata": {
1 year ago
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Affiliations \n2430154 WOS:000947693400001 UNIVERSITAT POLITECNICA DE VALENCIA \\\n2430132 WOS:000947693400001 SHANGHAITECH UNIVERSITY \n2430139 WOS:000947693400001 SHANGHAI OCEAN UNIVERSITY \n2430146 WOS:000947693400001 SHANGHAI JIAO TONG UNIVERSITY \n2430125 WOS:000947693400001 HUZHOU UNIVERSITY \n... ... ... \n43 WOS:000301090100061 BIRKBECK UNIVERSITY LONDON \n13 WOS:000297893800037 UNIVERSIDAD POLITECNICA DE MADRID \n11 WOS:000297893800037 BEIJING INSTITUTE OF TECHNOLOGY \n1 WOS:000209536100003 NATURAL HISTORY MUSEUM LONDON \n9 WOS:000209536100003 BULGARIAN ACADEMY OF SCIENCES \n\n Affiliations_merged \n2430154 UNIVERSITAT POLITECNICA DE VALENCIA \\\n2430132 SHANGHAITECH UNIVERSITY \n2430139 SHANGHAI UNIVERSITY \n2430146 SHANGHAI UNIVERSITY \n2430125 HUZHOU UNIVERSITY \n... ... \n43 BIRKBECK UNIVERSITY LONDON \n13 UNIVERSIDAD POLITECNICA DE MADRID \n11 BEIJING INSTITUTE OF TECHNOLOGY \n1 NATURAL HISTORY MUSEUM LONDON \n9 BULGARIAN ACADEMY OF SCIENCES \n\n Address Country \n2430154 Univ Politecn Valencia, European Inst Innovat... Spain \\\n2430132 ShanghaiTech Univ, Shanghai Inst Adv Immunoch... China \n2430139 Shanghai Ocean Univ, Coll Fisheries & Life Sc... China \n2430146 Shanghai Jiao Tong Univ, Ruijin Hosp, Sch Med... China \n2430125 Huzhou Univ, Sch Informat Engn, Huzhou 313000... China \n... ... ... \n43 Birkbeck Coll London, Sch Psychol, London, En... United Kingdom \n13 UPM, Ctr Elect Ind, Madrid 28006, Spain Spain \n11 UPM, Ctr Elect Ind, Madrid 28006, Spain Spain \n1 Nat Hist Museum, London SW7 5BD, England; United Kingdom \n9 Bulgarian Acad Sci, Inst Biodivers & Ecosyst ... Bulgaria \n\n City Country_Type Institution levehnstein \n2430154 Valencia EU UNIV POLITECN VALENCIA 13 \\\n2430132 Shanghai China SHANGHAITECH UNIV 6 \n2430139 Shanghai China SHANGHAI OCEAN UNIV 6 \n2430146 Meda China SHANGHAI JIAO TONG UNIV 6 \n2430125 Huzhou China HUZHOU UNIV 6 \n... ... ... ... ... \n43 London Other BIRKBECK COLL LONDON 10 \n13 Madrid EU UPM 30 \n11 Madrid EU UPM 30 \n1 London Other NAT HIST MUSEUM 14 \n9 Rees EU BULGARIAN ACAD SCI 11 \n\n token_overlap \n2430154 7 \n2430132 6 \n2430139 5 \n2430146 4 \n2430125 5 \n... ... \n43 5 \n13 3 \n11 3 \n1 7 \n9 6 \n\n[63590 rows x 10 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Affiliations</th>\n <th>Affiliations_merged</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n <th>levehnstein</th>\n <th>token_overlap</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2430154</th>\n <td>WOS:000947693400001</td>\n <td>UNIVERSITAT POLITECNICA DE VALENCIA</td>\n <td>UNIVERSITAT POLITECNICA DE VALENCIA</td>\n <td>Univ Politecn Valencia, European Inst Innovat...</td>\n <td>Spain</td>\n <td>Valencia</td>\n <td>EU</td>\n <td>UNIV POLITECN VALENCIA</td>\n <td>13</td>\n <td>7</td>\n </tr>\n <tr>\n <th>2430132</th>\n <td>WOS:000947693400001</td>\n <td>SHANGHAITECH UNIVERSITY</td>\n <td>SHANGHAITECH UNIVERSITY</td>\n <td>ShanghaiTech Univ, Shanghai Inst Adv Immunoch...</td>\n <td>China</td>\n <td>Shanghai</td>\n <td>China</td>\n <td>SHANGHAITECH UNIV</td>\n <td>6</td>\n <td>6</td>\n </tr>\n <tr>\n <th>2430139</th>\n <td>WOS:000947693400001</td>\n <td>SHANGHAI OCEAN UNIVERSITY</td>\n <td>SHANGHAI UNIVERSITY</td>\n <td>Shanghai Ocean Univ, Coll Fisheries &amp; Life Sc...</td>\n <td>China</td>\n <td>Shanghai</td>\n <td>China</td>\n <td>SHANGHAI OCEAN UNIV</td>\n <td>6</td>\n <td>5</td>\n </tr>\n <tr>\n <th>2430146</th>\n <td>WOS:000947693400001</td>\n <td>SHANGHAI JIAO TONG UNIVERSITY</td>\n <td>SHANGHAI UNIVERSITY</td>\n <td>Shanghai Jiao Tong Univ, Ruijin Hosp, Sch Med...</td>\n <td>China</td>\n <td>Meda</td>\n <td>China</td>\n <td>SHANGHAI JIAO TONG UNIV</td>\n <td>6</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2430125</th>\n <td>WOS:000947693400001</td>\n <td>HUZHOU UNIVERSITY</td>\n <td>HUZHOU UNIVERSITY</td>\n <td>Huzhou Univ, Sch Informat Engn, Huzhou 313000...</td>\n <td>China</td>\n <td>Huzhou</td>\n <td>China</td>\n <td>HUZHOU UNIV</td>\n <td>6</td>\n <td>5</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>43</th>\n <td>WOS:000301090100061</td>\n <td>BIRKBECK UNIVERSITY LONDON</td>\n <td>BIRKBECK UNIVERSITY LONDON</td>\n <td>Birkbeck Coll London, Sch Psychol, London, En...</td>\n <td>United Kingdom</td>\n <td>London</td>\n <td>Other</td>\n <td>BIRKBECK COLL LONDON</td>\n <td>10</td>\n <td>5</td>\n </tr>\n <tr>\n <th>13</th>\n <td>WOS:000297893800037</td>\n <td>UNIVERSIDAD POLITECNICA DE MADRID</td>\n <td>UNIVERSIDAD POLITECNICA DE MADRID</td>\n <td>UPM, Ctr Elect Ind, Madrid 28006, Spain</td>\n <td>Spain</td>\n <td>Madrid</td>\n <td>EU</td>\n <td>UPM</td>\n <td>30</td>\n <td>3</td>\n </tr>\n <tr>\n <th>11</th>\n <td>WOS:000297893800037</td>\n <td>BEIJING INSTITUTE OF TECHNOLOGY</td>\n <td>BEIJING INSTITUTE OF TECHNOLOGY</td>\n <td>UPM, Ctr Elect Ind, Madrid 28006, Spain</td>\n <td>Spain</td>\n <td>Madrid</td>\n <td>EU</td>\n <td>UPM</td>\n <td>30</td>\n <td>3</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>Nat Hist Museum, London SW7 5BD, England;</td>\n <td>United Kingdom<
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations.sort_values(by=[record_col,\"Affiliations\",\"token_overlap\"], ascending=[False,False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 80,
"outputs": [
{
"data": {
"text/plain": "Affiliations\n(ADVENTHEALTH) CENTRAL FLORIDA DIVISION CHARITE\n1 DECEMBRIE 1918 UNIVERSITY ALBA IULIA 1 DECEMBRIE 1918 UNIV ALBA IULIA\nA*STAR - BIOINFORMATICS INSTITUTE (BII) ASTAR\nA*STAR - GENOME INSTITUTE OF SINGAPORE (GIS) UNIV COPENHAGEN\nA*STAR - INSTITUTE FOR INFOCOMM RESEARCH (I2R) ASTAR\n ... \nZTE ZTE CORP\nZUNYI MEDICAL UNIVERSITY [JINAN UNIV, NCI, SANOFI]\nZURICH CENTER INTEGRATIVE HUMAN PHYSIOLOGY (ZIHP) UNIV ZURICH\nZURICH UNIVERSITY OF APPLIED SCIENCES [IRD, SAS, UCL]\nZUSE INSTITUTE BERLIN ZUSE INST BERLIN\nName: Institution, Length: 4884, dtype: object"
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"helper = affiliations.sort_values(by=[\"Affiliations\",\"token_overlap\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n",
"afh = helper[[\"Affiliations\",\"Institution\",\"Country\"]]\n",
"afh.groupby(\"Affiliations\")[\"Institution\"].agg(pd.Series.mode)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 82,
"outputs": [
{
"data": {
"text/plain": "Affiliations\n(ADVENTHEALTH) CENTRAL FLORIDA DIVISION Germany\n1 DECEMBRIE 1918 UNIVERSITY ALBA IULIA Romania\nA*STAR - BIOINFORMATICS INSTITUTE (BII) Singapore\nA*STAR - GENOME INSTITUTE OF SINGAPORE (GIS) Denmark\nA*STAR - INSTITUTE FOR INFOCOMM RESEARCH (I2R) Singapore\n ... \nZTE China\nZUNYI MEDICAL UNIVERSITY United States\nZURICH CENTER INTEGRATIVE HUMAN PHYSIOLOGY (ZIHP) Switzerland\nZURICH UNIVERSITY OF APPLIED SCIENCES [France, United Kingdom, United States]\nZUSE INSTITUTE BERLIN Germany\nName: Country, Length: 4884, dtype: object"
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"afh.groupby(\"Affiliations\")[\"Country\"].agg(pd.Series.mode)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 92,
"outputs": [],
"source": [
"helper1 = affiliations.sort_values(by=[\"Affiliations\",\"token_overlap\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n",
"afh1 = helper1[[\"Affiliations\",\"Institution\",\"City\",\"Country\",\"Country_Type\"]]\n",
"mode1_i = afh1.groupby(\"Affiliations\")[\"Institution\"].apply(pd.Series.mode).reset_index()\n",
"mode1_c = afh1.groupby(\"Affiliations\")[\"Country\"].apply(pd.Series.mode).reset_index()\n",
"mode1_city = afh1.groupby(\"Affiliations\")[\"City\"].apply(pd.Series.mode).reset_index()\n",
"mode1_type = afh1.groupby(\"Affiliations\")[\"Country_Type\"].apply(pd.Series.mode).reset_index()\n",
"\n",
"helper2 = affiliations.sort_values(by=[\"Affiliations\",\"levehnstein\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n",
"afh2 = helper2[[\"Affiliations\",\"Institution\",\"City\",\"Country\",\"Country_Type\"]]\n",
"mode2_i = afh2.groupby(\"Affiliations\")[\"Institution\"].apply(pd.Series.mode).reset_index()\n",
"mode2_c = afh2.groupby(\"Affiliations\")[\"Country\"].apply(pd.Series.mode).reset_index()\n",
"mode2_city = afh2.groupby(\"Affiliations\")[\"City\"].apply(pd.Series.mode).reset_index()\n",
"mode2_type = afh2.groupby(\"Affiliations\")[\"Country_Type\"].apply(pd.Series.mode).reset_index()\n",
"\n",
"mode_i = pd.concat([mode1_i,mode2_i],ignore_index=True)[[\"Affiliations\",\"Institution\"]].groupby(\"Affiliations\")[\"Institution\"].agg(\n",
" lambda x: pd.Series.mode(x)[0])\n",
"mode_c = pd.concat([mode1_c,mode2_c],ignore_index=True)[[\"Affiliations\",\"Country\"]].groupby(\"Affiliations\")[\"Country\"].agg(\n",
" lambda x: pd.Series.mode(x)[0])\n",
"mode_city = pd.concat([mode1_city,mode2_city],ignore_index=True)[[\"Affiliations\",\"City\"]].groupby(\"Affiliations\")[\"City\"].agg(\n",
" lambda x: pd.Series.mode(x)[0])\n",
"mode_type = pd.concat([mode1_type,mode2_type],ignore_index=True)[[\"Affiliations\",\"Country_Type\"]].groupby(\"Affiliations\")[\"Country_Type\"].agg(\n",
" lambda x: pd.Series.mode(x)[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 99,
"outputs": [
{
"data": {
"text/plain": " Affiliations \n0 (ADVENTHEALTH) CENTRAL FLORIDA DIVISION \\\n1 1 DECEMBRIE 1918 UNIVERSITY ALBA IULIA \n2 A*STAR - BIOINFORMATICS INSTITUTE (BII) \n3 A*STAR - GENOME INSTITUTE OF SINGAPORE (GIS) \n4 A*STAR - INSTITUTE FOR INFOCOMM RESEARCH (I2R) \n... ... \n4795 ZTE \n4796 ZUNYI MEDICAL UNIVERSITY \n4797 ZURICH CENTER INTEGRATIVE HUMAN PHYSIOLOGY (ZIHP) \n4798 ZURICH UNIVERSITY OF APPLIED SCIENCES \n4799 ZUSE INSTITUTE BERLIN \n\n Institution (short name from address) Country_candidate City_candidate \n0 CHARITE Canada Berlin \\\n1 1 DECEMBRIE 1918 UNIV ALBA IULIA Romania Alba Iulia \n2 ASTAR China Jinan \n3 AGCY SCI TECHNOL & RES Denmark Copenhagen \n4 ASTAR Singapore Rees \n... ... ... ... \n4795 ZTE CORP China Shenzhen \n4796 JINAN UNIV China Bethesda \n4797 NATL CTR EXCELLENCE YOUTH MENTAL HLTH Switzerland Zürich \n4798 IRD France Cary \n4799 ZUSE INST BERLIN Germany Berlin \n\n Country_type_candidate \n0 EU \n1 EU \n2 China \n3 EU \n4 Other \n... ... \n4795 China \n4796 China \n4797 Other \n4798 Other \n4799 EU \n\n[4800 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Affiliations</th>\n <th>Institution (short name from address)</th>\n <th>Country_candidate</th>\n <th>City_candidate</th>\n <th>Country_type_candidate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>(ADVENTHEALTH) CENTRAL FLORIDA DIVISION</td>\n <td>CHARITE</td>\n <td>Canada</td>\n <td>Berlin</td>\n <td>EU</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1 DECEMBRIE 1918 UNIVERSITY ALBA IULIA</td>\n <td>1 DECEMBRIE 1918 UNIV ALBA IULIA</td>\n <td>Romania</td>\n <td>Alba Iulia</td>\n <td>EU</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A*STAR - BIOINFORMATICS INSTITUTE (BII)</td>\n <td>ASTAR</td>\n <td>China</td>\n <td>Jinan</td>\n <td>China</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A*STAR - GENOME INSTITUTE OF SINGAPORE (GIS)</td>\n <td>AGCY SCI TECHNOL &amp; RES</td>\n <td>Denmark</td>\n <td>Copenhagen</td>\n <td>EU</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A*STAR - INSTITUTE FOR INFOCOMM RESEARCH (I2R)</td>\n <td>ASTAR</td>\n <td>Singapore</td>\n <td>Rees</td>\n <td>Other</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>4795</th>\n <td>ZTE</td>\n <td>ZTE CORP</td>\n <td>China</td>\n <td>Shenzhen</td>\n <td>China</td>\n </tr>\n <tr>\n <th>4796</th>\n <td>ZUNYI MEDICAL UNIVERSITY</td>\n <td>JINAN UNIV</td>\n <td>China</td>\n <td>Bethesda</td>\n <td>China</td>\n </tr>\n <tr>\n <th>4797</th>\n <td>ZURICH CENTER INTEGRATIVE HUMAN PHYSIOLOGY (ZIHP)</td>\n <td>NATL CTR EXCELLENCE YOUTH MENTAL HLTH</td>\n <td>Switzerland</td>\n <td>Zürich</td>\n <td>Other</td>\n </tr>\n <tr>\n <th>4798</th>\n <td>ZURICH UNIVERSITY OF APPLIED SCIENCES</td>\n <td>IRD</td>\n <td>France</td>\n <td>Cary</td>\n <td>Other</td>\n </tr>\n <tr>\n <th>4799</th>\n <td>ZUSE INSTITUTE BERLIN</td>\n <td>ZUSE INST BERLIN</td>\n <td>Germany</td>\n <td>Berlin</td>\n <td>EU</td>\n </tr>\n </tbody>\n</table>\n<p>4800 rows × 5 columns</p>\n</div>"
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
2 years ago
}
1 year ago
],
"source": [
"from functools import reduce\n",
"dfs = [mode_i, mode_c, mode_city, mode_type]\n",
"mode_final = reduce(lambda left,right: pd.merge(left,right,on='Affiliations'), dfs)\n",
"mode_final = mode_final.reset_index()\n",
"mode_final.columns = [\"Affiliations\",\"Institution (short name from address)\",\"Country_candidate\",\"City_candidate\",\"Country_type_candidate\"]\n",
"mode_final"
],
"metadata": {
"collapsed": false
2 years ago
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [
{
"data": {
"text/plain": " Affiliations \n1873185 (ADVENTHEALTH) CENTRAL FLORIDA DIVISION \\\n1873299 (ADVENTHEALTH) CENTRAL FLORIDA DIVISION \n1873346 (ADVENTHEALTH) CENTRAL FLORIDA DIVISION \n1873394 (ADVENTHEALTH) CENTRAL FLORIDA DIVISION \n1873170 (ADVENTHEALTH) CENTRAL FLORIDA DIVISION \n... ... \n715405 ZUSE INSTITUTE BERLIN \n1548143 ZUSE INSTITUTE BERLIN \n715403 ZUSE INSTITUTE BERLIN \n1548154 ZUSE INSTITUTE BERLIN \n715409 ZUSE INSTITUTE BERLIN \n\n Institution levehnstein \n1873185 ST JOSEPHS HLTH CARE LONDON 28 \n1873299 ATHENS NAVAL & VET HOSP 28 \n1873346 ASST VALCAMONICA OSPED ESINE 28 \n1873394 ASST VALTELLINA & ALTO LARIO 28 \n1873170 FUNDACAO CTR MED CAMPINAS 29 \n... ... ... \n715405 CARL VON OSSIETZKY UNIV OLDENBURG 25 \n1548143 CHONGQING UNIV POSTS & TELECOMMUN 26 \n715403 GERMAN CTR NEURODEGENRAT DIS DZNE 27 \n1548154 UNIV KLINIKUM SCHLESWIG HOLSTEIN KIEL 30 \n715409 INESC TEC INST ENGN SISTEMAS & COMP TECNOL & CIEN 35 \n\n[773544 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Affiliations</th>\n <th>Institution</th>\n <th>levehnstein</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1873185</th>\n <td>(ADVENTHEALTH) CENTRAL FLORIDA DIVISION</td>\n <td>ST JOSEPHS HLTH CARE LONDON</td>\n <td>28</td>\n </tr>\n <tr>\n <th>1873299</th>\n <td>(ADVENTHEALTH) CENTRAL FLORIDA DIVISION</td>\n <td>ATHENS NAVAL &amp; VET HOSP</td>\n <td>28</td>\n </tr>\n <tr>\n <th>1873346</th>\n <td>(ADVENTHEALTH) CENTRAL FLORIDA DIVISION</td>\n <td>ASST VALCAMONICA OSPED ESINE</td>\n <td>28</td>\n </tr>\n <tr>\n <th>1873394</th>\n <td>(ADVENTHEALTH) CENTRAL FLORIDA DIVISION</td>\n <td>ASST VALTELLINA &amp; ALTO LARIO</td>\n <td>28</td>\n </tr>\n <tr>\n <th>1873170</th>\n <td>(ADVENTHEALTH) CENTRAL FLORIDA DIVISION</td>\n <td>FUNDACAO CTR MED CAMPINAS</td>\n <td>29</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>715405</th>\n <td>ZUSE INSTITUTE BERLIN</td>\n <td>CARL VON OSSIETZKY UNIV OLDENBURG</td>\n <td>25</td>\n </tr>\n <tr>\n <th>1548143</th>\n <td>ZUSE INSTITUTE BERLIN</td>\n <td>CHONGQING UNIV POSTS &amp; TELECOMMUN</td>\n <td>26</td>\n </tr>\n <tr>\n <th>715403</th>\n <td>ZUSE INSTITUTE BERLIN</td>\n <td>GERMAN CTR NEURODEGENRAT DIS DZNE</td>\n <td>27</td>\n </tr>\n <tr>\n <th>1548154</th>\n <td>ZUSE INSTITUTE BERLIN</td>\n <td>UNIV KLINIKUM SCHLESWIG HOLSTEIN KIEL</td>\n <td>30</td>\n </tr>\n <tr>\n <th>715409</th>\n <td>ZUSE INSTITUTE BERLIN</td>\n <td>INESC TEC INST ENGN SISTEMAS &amp; COMP TECNOL &amp; CIEN</td>\n <td>35</td>\n </tr>\n </tbody>\n</table>\n<p>773544 rows × 3 columns</p>\n</div>"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aff_lookup = affiliations[[\"Affiliations\",\"Institution\",\"levehnstein\"]].drop_duplicates().sort_values(by=[\"Affiliations\",\"levehnstein\"],ascending=[True,True])\n",
"aff_lookup"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [
{
"data": {
"text/plain": "[['THERESIENKRANKENHAUS'],\n ['CHARITE'],\n ['SALAMAH'],\n ['ASTAR'],\n ['INSERM'],\n ['CNRS'],\n ['MIT'],\n ['CNPQ'],\n ['UNICEN'],\n ['IRCCS'],\n ['NEI'],\n ['UCL'],\n ['UESTC'],\n ['SOVECTRON'],\n ['NTENT'],\n ['IEEE'],\n ['QMUL'],\n ['LCA'],\n ['CALTECH'],\n ['EUROFUSION'],\n ['LIWFUSION'],\n ['CIEMAT'],\n ['UNED'],\n ['ZALANDO'],\n ['VIIT'],\n ['CUNY'],\n ['KIIT'],\n ['USTC'],\n ['ASIPP'],\n ['ORISE'],\n ['IET'],\n ['AAIA'],\n ['TRAP'],\n ['CSIC'],\n ['ESAC'],\n ['ESTEC'],\n ['SISSA'],\n ['CERN'],\n ['IRFM'],\n ['NOKIA'],\n ['BUPT'],\n ['JET'],\n ['NIH'],\n ['MICROSOFT'],\n ['METU'],\n ['RIKEN'],\n ['QST'],\n ['DIFFER'],\n ['CEFCA'],\n ['ULL'],\n ['INFN'],\n ['IUCAA'],\n ['BCS'],\n ['KTH'],\n ['CRPP'],\n ['CEA'],\n ['ULB'],\n ['CCFE'],\n ['COMPX'],\n ['HKUST'],\n ['UNSW'],\n ['IEEC'],\n ['AMAZON'],\n ['IPSL'],\n ['IRD'],\n ['RAS'],\n ['CIRAD'],\n ['CREAF'],\n ['NYU'],\n ['EPFL'],\n ['UPMC'],\n ['UAM'],\n ['NTNU'],\n ['ABBVIE'],\n ['GLAXOSMITHKLINE'],\n ['BELANGER-CHAMPAGNE'],\n ['AHARROUCHE'],\n ['BAHMANI'],\n ['COUTINHO'],\n ['ICREA'],\n ['BELLOMO'],\n ['AKESSON'],\n ['UCAS'],\n ['LPTPM'],\n ['CINDRO'],\n ['AKATSUKA'],\n ['AHMADOV'],\n ['DESY'],\n ['AOUN'],\n ['IN2P3'],\n ['IFAE'],\n ['CHEN'],\n ['AOKI'],\n ['AAD'],\n ['KEK'],\n ['FIAS'],\n ['HBNI'],\n ['CAS'],\n ['GRANDITUDE'],\n ['CSIRO'],\n ['NHGRI'],\n ['NPR'],\n ['ACECR'],\n ['MRC'],\n ['ORYGEN'],\n ['NEUROSKETCH'],\n ['JANCSITECH'],\n ['CNR'],\n ['BAINBRIDGE'],\n ['NICPB'],\n ['NIKITENKO'],\n ['IISER'],\n ['IPN'],\n ['BELL'],\n ['PATH'],\n ['WHO'],\n ['CHALEARN'],\n ['4PARADIGM'],\n ['CORNELL'],\n ['INRIA'],\n ['ANU'],\n ['USC'],\n ['CMU'],\n ['UIUC'],\n ['EMORY'],\n ['ABDULLIN'],\n ['ACOSTA'],\n ['CUMALAT'],\n ['AMIN'],\n ['BRANSON'],\n ['BELYAEV'],\n ['ETH'],\n ['NSU'],\n ['NTU'],\n ['COUBEZ'],\n ['INPP'],\n ['OEAW'],\n ['GAPPS'],\n ['TNO'],\n ['MEDTRONIC'],\n ['METEOSWISS'],\n ['ASTRON'],\n ['INAF'],\n ['ESA'],\n ['IFPU'],\n ['INTERDIGITAL'],\n ['QCAT'],\n ['AETHERAI'],\n ['AGRESEARCH'],\n ['ARS'],\n ['CICAPS'],\n ['INRA'],\n ['AGROPARISTECH'],\n ['AGROSCOPE'],\n ['UAB'],\n ['MAGELLIUM'],\n ['IFREMER'],\n ['AIRBUS'],\n ['EUMETSAT'],\n ['CINVESTAV'],\n ['AREEO'],\n ['NOVELTIS'],\n ['NERSC'],\n ['IRRCS'],\n ['ISRO'],\n ['CNES'],\n ['JAMSTEC'],\n ['CIMA'],\n ['UNAM'],\n ['SOCIB'],\n ['CLS'],\n ['PAS'],\n ['OCEANDATALAB'],\n ['LLC'],\n ['NIKHEF'],\n ['TIFR'],\n ['CAFPE'],\n ['ECMWF'],\n ['SATOC'],\n ['NOAA'],\n ['CPRM'],\n ['SHOM'],\n ['DAIM'],\n ['UTM'],\n ['NIA'],\n ['POSTECH'],\n ['DBRAIN'],\n ['GIANTAI'],\n ['ISCAS'],\n ['GOOGLE'],\n ['INTEL'],\n ['SOARTECH'],\n ['NNAISENSE'],\n ['OOSTO'],\n ['HUMINTEC'],\n ['CAPSS'],\n ['ISS'],\n ['JINR'],\n ['AMU'],\n ['HIP'],\n ['IKERBASQUE'],\n ['TRIUMF'],\n ['SNOLAB'],\n ['TUNL'],\n ['NASU'],\n ['ZILLOW'],\n ['OICR'],\n ['NCI'],\n ['DOCBOT'],\n ['PRIZE4LIFE'],\n ['LINKEDIN'],\n ['NVIDIA'],\n ['SONATRACH'],\n ['NPC'],\n ['SIMATS'],\n ['CAPSBE'],\n ['CCNU'],\n ['IIT'],\n ['URCA'],\n ['SOUNDCLOUD'],\n ['LINEA'],\n ['OPROJECT'],\n ['ELSEVIER'],\n ['LSST'],\n ['SMARTMORE'],\n ['JRC'],\n ['CASTELLDEFELS'],\n ['NASA'],\n ['AREU'],\n ['CERIST'],\n ['INSIGHTS2TECHINFO'],\n ['ERICSSON'],\n ['INICSA'],\n ['PSYCHIAT'],\n ['UNICAMILLUS'],\n ['ULTROMICS'],\n ['ISSSTE'],\n ['INCMNSZ'],\n ['TQEH'],\n ['PATHAI'],\n ['WITSEE'],\n ['HOSAIO'],\n ['INNSZ'],\n ['MSGSU'],\n ['EMBL'],\n ['NIMH'],\n ['UCLA'],\n ['HUAWEI'],\n ['HUST'],\n ['OMRF'],\n ['UMCU'],\n ['PTB'],\n ['UMCL'],\n ['TUM'],\n ['UNC'],\n ['UVSQ'],\n ['PSL'],\n ['CAML'],\n ['CMC'],\n ['UMC'],\n ['GRCC'],\n ['IRIT'],\n ['IRISA'],\n ['OSUR'],\n ['UOC'],\n ['ASTRAZENECA'],\n ['BAYER'],\n ['GALIXIR'],\n ['BEIERSDORF'],\n ['AT&T'],\n ['IST'],\n ['MILA'],\n ['ACCENTURE'],\n ['1QBIT'],\n ['UGA'],\n ['UBL'],\n ['UIB'],\n ['IAE'],\n ['INAIL'],\n ['JARA'],\n ['CIBERSAM'],\n ['IDIBAPS'],\n ['QUALCOMM'],\n ['NICTA'],\n ['IAC'],\n ['BYTEDANCE'],\n ['BBC'],\n ['NUMENTA'],\n ['OROBIX'],\n ['VHIR'],\n ['INAOE'],\n ['BIST'],\n ['EURECAT'],\n ['UB'],\n ['PAB'],\n ['FAU'],\n ['ICISE'],\n ['BAIDU'],\n ['EURECOM'],\n ['BUITEMS'],\n ['BHU'],\n ['DNANEXUS'],\n ['RESIST'],\n ['I
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aff_lookup_levehnstein = aff_lookup.copy()\n",
"aff_lookup_overlap = aff_lookup.copy()\n",
"inst_short = sorted([i.split(\" \") for i in list(aff_lookup_overlap[\"Institution\"].unique())], key=len)\n",
"inst_short"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [
{
"data": {
"text/plain": " Affiliations \n1873185 (ADVENTHEALTH) CENTRAL FLORIDA DIVISION \\\n1939932 1 DECEMBRIE 1918 UNIVERSITY ALBA IULIA \n933680 A*STAR - BIOINFORMATICS INSTITUTE (BII) \n2257766 A*STAR - GENOME INSTITUTE OF SINGAPORE (GIS) \n2364292 A*STAR - INSTITUTE FOR INFOCOMM RESEARCH (I2R) \n... ... \n1523750 ZTE \n2032613 ZUNYI MEDICAL UNIVERSITY \n476604 ZURICH CENTER INTEGRATIVE HUMAN PHYSIOLOGY (ZIHP) \n975211 ZURICH UNIVERSITY OF APPLIED SCIENCES \n715406 ZUSE INSTITUTE BERLIN \n\n Institution levehnstein \n1873185 ST JOSEPHS HLTH CARE LONDON 28 \n1939932 1 DECEMBRIE 1918 UNIV ALBA IULIA 6 \n933680 SHANDONG NORMAL UNIV 29 \n2257766 AGCY SCI TECHNOL & RES 34 \n2364292 INST INFOCOMM RES I2R 25 \n... ... ... \n1523750 ZTE CORP 5 \n2032613 ZUNYI MED UNIV 10 \n476604 SWISS FED INST TECHNOL ZURICH 36 \n975211 ZURICH UNIV APPL SCI ZHAW 17 \n715406 ZUSE INST BERLIN 5 \n\n[4884 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Affiliations</th>\n <th>Institution</th>\n <th>levehnstein</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1873185</th>\n <td>(ADVENTHEALTH) CENTRAL FLORIDA DIVISION</td>\n <td>ST JOSEPHS HLTH CARE LONDON</td>\n <td>28</td>\n </tr>\n <tr>\n <th>1939932</th>\n <td>1 DECEMBRIE 1918 UNIVERSITY ALBA IULIA</td>\n <td>1 DECEMBRIE 1918 UNIV ALBA IULIA</td>\n <td>6</td>\n </tr>\n <tr>\n <th>933680</th>\n <td>A*STAR - BIOINFORMATICS INSTITUTE (BII)</td>\n <td>SHANDONG NORMAL UNIV</td>\n <td>29</td>\n </tr>\n <tr>\n <th>2257766</th>\n <td>A*STAR - GENOME INSTITUTE OF SINGAPORE (GIS)</td>\n <td>AGCY SCI TECHNOL &amp; RES</td>\n <td>34</td>\n </tr>\n <tr>\n <th>2364292</th>\n <td>A*STAR - INSTITUTE FOR INFOCOMM RESEARCH (I2R)</td>\n <td>INST INFOCOMM RES I2R</td>\n <td>25</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1523750</th>\n <td>ZTE</td>\n <td>ZTE CORP</td>\n <td>5</td>\n </tr>\n <tr>\n <th>2032613</th>\n <td>ZUNYI MEDICAL UNIVERSITY</td>\n <td>ZUNYI MED UNIV</td>\n <td>10</td>\n </tr>\n <tr>\n <th>476604</th>\n <td>ZURICH CENTER INTEGRATIVE HUMAN PHYSIOLOGY (ZIHP)</td>\n <td>SWISS FED INST TECHNOL ZURICH</td>\n <td>36</td>\n </tr>\n <tr>\n <th>975211</th>\n <td>ZURICH UNIVERSITY OF APPLIED SCIENCES</td>\n <td>ZURICH UNIV APPL SCI ZHAW</td>\n <td>17</td>\n </tr>\n <tr>\n <th>715406</th>\n <td>ZUSE INSTITUTE BERLIN</td>\n <td>ZUSE INST BERLIN</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n<p>4884 rows × 3 columns</p>\n</div>"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aff_lookup.drop_duplicates(subset=\"Affiliations\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[39], line 9\u001B[0m\n\u001B[0;32m 4\u001B[0m aff_lookup \u001B[38;5;241m=\u001B[39m aff_m\u001B[38;5;241m.\u001B[39mmerge(inst_m, how\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcross\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[0;32m 6\u001B[0m \u001B[38;5;66;03m# aff_lookup[\"levehnstein\"] = aff_lookup.apply(\u001B[39;00m\n\u001B[0;32m 7\u001B[0m \u001B[38;5;66;03m# lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\u001B[39;00m\n\u001B[1;32m----> 9\u001B[0m aff_lookup\u001B[38;5;241m.\u001B[39massign(distance\u001B[38;5;241m=\u001B[39m[\u001B[38;5;241m*\u001B[39m\u001B[38;5;28mmap\u001B[39m(edit_distance, aff_lookup\u001B[38;5;241m.\u001B[39mAffiliations, aff_lookup\u001B[38;5;241m.\u001B[39mInstitution)])\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\nltk\\metrics\\distance.py:111\u001B[0m, in \u001B[0;36medit_distance\u001B[1;34m(s1, s2, substitution_cost, transpositions)\u001B[0m\n\u001B[0;32m 109\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m s1[i \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m] \u001B[38;5;241m==\u001B[39m s2[j \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m]:\n\u001B[0;32m 110\u001B[0m last_right_buf \u001B[38;5;241m=\u001B[39m j\n\u001B[1;32m--> 111\u001B[0m \u001B[43m_edit_dist_step\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 112\u001B[0m \u001B[43m \u001B[49m\u001B[43mlev\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 113\u001B[0m \u001B[43m \u001B[49m\u001B[43mi\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 114\u001B[0m \u001B[43m \u001B[49m\u001B[43mj\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 115\u001B[0m \u001B[43m \u001B[49m\u001B[43ms1\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 116\u001B[0m \u001B[43m \u001B[49m\u001B[43ms2\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 117\u001B[0m \u001B[43m \u001B[49m\u001B[43mlast_left\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 118\u001B[0m \u001B[43m \u001B[49m\u001B[43mlast_right\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 119\u001B[0m \u001B[43m \u001B[49m\u001B[43msubstitution_cost\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msubstitution_cost\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 120\u001B[0m \u001B[43m \u001B[49m\u001B[43mtranspositions\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtranspositions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 121\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 122\u001B[0m last_left_t[s1[i \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m]] \u001B[38;5;241m=\u001B[39m i\n\u001B[0;32m 123\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m lev[len1][len2]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\nltk\\metrics\\distance.py:52\u001B[0m, in \u001B[0;36m_edit_dist_step\u001B[1;34m(lev, i, j, s1, s2, last_left, last_right, substitution_cost, transpositions)\u001B[0m\n\u001B[0;32m 50\u001B[0m b \u001B[38;5;241m=\u001B[39m lev[i][j \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m] \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m\n\u001B[0;32m 51\u001B[0m \u001B[38;5;66;03m# substitution\u001B[39;00m\n\u001B[1;32m---> 52\u001B[0m c \u001B[38;5;241m=\u001B[39m \u001B[43mlev\u001B[49m\u001B[43m[\u001B[49m\u001B[43mi\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m-\u001B[39;49m\u001B[43m \u001B[49m\u001B[38;5;241;43m1\u001B[39;49m\u001B[43m]\u001B[49m[j \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m] \u001B[38;5;241m+\u001B[39m (substitution_cost \u001B[38;5;28;01mif\u001B[39;00m c1 \u001B[38;5;241m!=\u001B[39m c2 \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;241m0\u001B[39m)\n\u001B[0;32m 54\u001B[0m \u001B[38;5;66;03m# transposition\u001B[39;00m\n\u001B[0;32m 55\u001B[0m d \u001B[38;5;241m=\u001B[39m c \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m \u001B[38;5;66;03m# never picked by default\u001B[39;00m\n",
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
]
}
],
"source": [
"# aff_m = pd.DataFrame(affiliations[\"Affiliations\"].unique(), columns=[\"Affiliations\"])\n",
"# inst_m = pd.DataFrame(affiliations[[\"Institution\",\"Country_Type\",\"Country\",\"City\"]].drop_duplicates(),columns=[\"Institution\",\"Country_Type\",\"Country\",\"City\"])\n",
"#\n",
"# aff_lookup = aff_m.merge(inst_m, how='cross')\n",
"#\n",
"# # aff_lookup[\"levehnstein\"] = aff_lookup.apply(\n",
"# # lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n",
"#\n",
"# aff_lookup.assign(distance=[*map(edit_distance, aff_lookup.Affiliations, aff_lookup.Institution)])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAj0AAAGsCAYAAAA2QxZ6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsLElEQVR4nO3de3gU9b3H8c9uQi4SU0OAVFDxgtwChJAIovGGBYOSA2KxiApewYcCeuhTIVghCIhBi1rxKNjGouClKGApN0XU4wGFY5Bg4CRNRCAVKwsNImxIDDvnD2TrGpBkMslk+b1fz8PzML+d+e2XL0P2w8zsjMeyLEsAAACnOa/bBQAAADQGQg8AADACoQcAABiB0AMAAIxA6AEAAEYg9AAAACMQegAAgBEIPQAAwAiEHgAAYARjQ09VVZUGDhyojRs31nqbTZs2adCgQUpJSdHNN9+soqKiBqwQAAA4ycjQU1lZqQkTJqikpKTW25SVlenee+9Vv3799NZbb6ljx44aM2aMqqqqGrBSAADgFONCT2lpqW6++Wbt3r27TtstXLhQ3bt319ixY3X++edr8uTJ8nq92rFjRwNVCgAAnGRc6Nm0aZN69+6t119/vcZrn3zyiYYMGaLu3bsrKytLa9asCdmuf//+weXY2FitXbtWnTp1apS6AQBA/US6XUBjGz58+AnHfT6fRo8erf/8z//UFVdcoS1btmjSpElKTExUenq6ysrKFBMTo/Hjx+uTTz5R+/btNWXKFLVv376R/wQAAMAO4470nMyiRYt02WWX6bbbblO7du00aNAg/epXv9KCBQskSX6/X0888YQuueQSvfDCCzr77LN1xx136PDhwy5XDgAAasO4Iz0ns2PHDr333ntKTU0Njn333Xe64IILJEkRERHq27evbr/9dknS9OnTdfXVV2vdunXKyspypWYAAFB7hJ7vVVdXKysrS/fdd1/IeGTksRa1atUqGIAkKSoqSm3bttVXX33VqHUCAAB7OL31vQsuuEC7du1Su3btgr/effddLV++XJLUo0cPFRcXB9evqqpSWVmZzjnnHLdKBgAAdUDo+d7w4cNVWFioJ598Ujt37tTy5cs1Z84ctWnTRpI0cuRIrVmzRq+88op27typRx55RNHR0br66qvdLRwAANSKx7Isy+0i3NKxY0e99NJL6t27tyRpw4YNeuKJJ/T3v/9dSUlJuvPOO3XbbbcF11+7dq2eeOIJffnll+rataseeeQRXXzxxW6VDwAA6sDo0AMAAMzB6S0AAGAEQg8AADACoQcAABiB0AMAAIxg3M0J9+//Vk5euu3xSImJZzo+rwnonT30zR76Zh+9s4e+2ffD3knHfu8E40KPZalBdr6GmtcE9M4e+mYPfbOP3tlD3+xzum+c3gIAAEYg9AAAACMQegAAgBEIPQAAwAiEHgAAYARCDwAAMAKhBwAAGIHQAwAAjEDoAQAARiD0AAAAIxB6AACAEQg9AADACIQeAABgBOOeso5jvF6PvF6P22VIkiIiape9AwFLgQCPKgYA2EPoMZDX69HPzjpDkbUMGw0tIaF5rdarPhrQNwf8BB8AgC2EHgN5vR5FRnh1/2ufqnTvIbfLqZX2reP09LBUeb0eQg8AwBZCj8FK9x7Stj0H3S4DAIBG0TTObwAAADQwQg8AADACoQcAABiB0AMAAIxA6AEAAEYg9AAAACMQegAAgBEIPQAAwAiEHgAAYARCDwAAMAKhBwAAGIHQAwAAjEDoAQAARiD0AAAAIxB6AACAEQg9AADACIQeAABgBEIPAAAwAqEHAAAYgdADAACMQOgBAABGIPQAAAAjEHoAAIARCD0AAMAIhB4AAGAEQg8AADACoQcAABiB0AMAAIxA6AEAAEYg9AAAACMQegAAgBEIPQAAwAiEHgAAYARCDwAAMAKhBwAAGKFJhJ6qqioNHDhQGzduPOk627dv19ChQ5WSkqKbbrpJhYWFjVghAAAId66HnsrKSk2YMEElJSUnXcfv92vUqFFKT0/XkiVLlJqaqtGjR8vv9zdipQAAIJy5GnpKS0t18803a/fu3T+53sqVKxUdHa0HH3xQF110kR566CE1b95cq1evbqRKAQBAuHM19GzatEm9e/fW66+//pPrFRQUKC0tTR6PR5Lk8XjUs2dPbdmypRGqBAAAp4NIN998+PDhtVrP5/Opffv2IWOJiYk/eUrsZL7PTY45Pp/T8+LkTO81+5w99M0+emcPfbOvoXrnauiprYqKCkVFRYWMRUVFqaqqqs5zJSae6VRZjTIvQiUkNHe7hCaDfc4e+mYfvbOHvtnndO/CIvRER0fXCDhVVVWKiYmp81z7938ry3KqsmMpNDHxTMfnbUgREd6wDQ/l5Yd19GjA7TJcFY77XFNA3+yjd/bQN/t+2DvJufATFqEnKSlJ+/btCxnbt2+fWrduXee5LEsNsvM11LyoiT4fwz5nD32zj97ZQ9/sc7pvrn9lvTZSUlL06aefyvr+T29ZljZv3qyUlBSXKwMAAOGiyYYen8+nI0eOSJIyMzN18OBBzZw5U6WlpZo5c6YqKio0YMAAl6sEAADhosmGnoyMDK1cuVKSFBcXp3nz5ik/P19DhgxRQUGB5s+frzPOOMPlKgEAQLhoMtf0FBcX/+Ry9+7dtXTp0sYsCQAAnEaa7JEeAAAAJxF6AACAEQg9AADACIQeAABgBEIPAAAwAqEHAAAYgdADAACMQOgBAABGIPQAAAAjEHoAAIARCD0AAMAIhB4AAGAEQg8AADACoQcAABiB0AMAAIxA6AEAAEYg9AAAACMQegAAgBEIPQAAwAiEHgAAYARCDwAAMAKhBwAAGIHQAwAAjEDoAQAARiD0AAAAIxB6AACAEQg9AADACIQeAABgBEIPAAAwAqEHAAAYgdADAACMQOgBAABGIPQAAAAjEHoAAIARCD0AAMAIhB4AAGCESLcLAOoiIiK8cnogYCkQsNwuAwAgQg/CRKu4aB0NWIqPj3W7lDqpPhrQNwf8BB8AaAIIPQgL8bGRivB6dP9rn6p07yG3y6mV9q3j9PSwVHm9HkIPADQBhB6EldK9h7Rtz0G3ywAAhKHwukACAADAJkIPAAAwAqEHAAAYgdADAACMQOgBAABGIPQAAAAjEHoAAIARCD0AAMAIhB4AAGAEQg8AADCCq6GnsrJSkydPVnp6ujIyMpSXl3fSdd955x0NGDBAqampuuWWW7Rt27ZGrBQAAIQ7V0PP7NmzVVhYqAULFmjq1KmaO3euVq9eXWO9kpIS/eY3v9Ho0aP11ltvqXPnzho9erQqKipcqBoAAIQj10KP3+/X4sWL9dBDDyk5OVn9+vXTPffco0WLFtVYd/369Wrfvr0GDx6s8847TxMmTJDP51NpaakLlQMAgHDkWugpKipSdXW1UlNTg2NpaWkqKChQIBAIWfess85SaWmp8vPzFQgEtGTJEsXFxem8885r7LIBAECYinTrjX0+nxISEhQVFRUca9mypSorK3XgwAG1aNEiOH799ddr3bp1Gj58uCIiIuT1ejVv3jz97Gc/q/P7ejyOlF9jPqfnxenFyf2Dfc4e+mYfvbOHvtnXUL1zLfRUVFSEBB5JweWqqqqQ8fLycvl8Pk2ZMkUpKSl69dVXlZ2draVLlyoxMbFO75uYeGb9Cm/keRH+EhKaN8i87HP20Df76J099M0+p3vnWuiJjo6uEW6OL8fExISMP/HEE+rQoYNuvfVWSdL06dM1YMAAvfnmmxo1alSd3nf//m9lWfUo/Ec8nmN/KU7P25AiIrwN9kGMmsrLD+vo0cCpV6ylcNznmgL6Zh+9s4e+2ffD3knOhR/XQk9SUpLKy8tVXV2tyMhjZfh8PsXExCg+Pj5k3W3btun2228PLnu9XnXq1El79uyp8/talhpk52uoeXF6YJ9rOuibffTOHvpmn9N9c+1C5s6dOysyMlJbtmwJjuXn56tbt27yekPLat26tT7//POQsS+++ELnnHNOY5QKAABOA66FntjYWA0ePFg5OTnaunWr1q5dq7y8PI0YMULSsaM+R44ckSTdfPP
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"affiliations[\"levehnstein\"].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
1 year ago
{
"cell_type": "code",
"execution_count": 74,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk8AAAGdCAYAAAAL2ZfXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtxklEQVR4nO3de1zVVb7/8fcG4qLEL+Q2oh5nRo830g2BaJlpPJxJ00aPl86Yk1nO4JhInZm0kI5ihiZeKofSrDRNS7NMj5e0qRynMbMJBTKGHpAzyfEGmEoGgrD37w+HfWbnBdZu295bXs/Hg0futb7ftT659pa33+/yi8Vut9sFAACAZvHzdAEAAAC+hPAEAABggPAEAABggPAEAABggPAEAABggPAEAABggPAEAABggPAEAABggPAEAABggPAEAABgIMDTBVzLTp78Ru784TcWixQRcb3bx8XVwXr5DtbKd7BWvsXX1qux3qYQnq4iu11X5c1ytcbF1cF6+Q7WynewVr7lWlsvbtsBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYCPB0AYC38vOzyM/P8r3H8ff/4f6OYrPZZbNdQz+6HAC8EOEJuAQ/P4v+3w2tFOCG4BMe3toNFTVPfYNNZ05XE6AA4CoiPAGX4OdnUYC/nx5ad0Cl5Wc9XU6zdI4O1bO/TJCfn4XwBABXEeEJuILS8rP6/GiVp8sAAHgRNowDAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAYIDwBAAAY8Gh4OnbsmCZNmqSbbrpJKSkpeuWVVxx9RUVFGjNmjKxWq0aNGqWDBw86nbt161YNGjRIVqtVU6ZM0ddff+3os9vtWrhwofr27avk5GTl5OTIZrM5+k+dOqWpU6cqISFBKSkp2rx5s9PYTc0NAABaLo+Gp4cfflitWrXSxo0bNWPGDD3zzDP64x//qOrqaqWmpiopKUkbN25UQkKCJk2apOrqaklSYWGhMjMzlZaWpvXr16uqqkoZGRmOcVeuXKmtW7cqNzdXS5Ys0ZYtW7Ry5UpHf0ZGhr755hutX79ekydP1uOPP67CwkJJanJuAADQsnksPJ05c0b5+fmaPHmyfvzjH2vQoEHq37+/9u7dq+3btysoKEjTp09Xp06dlJmZqdatW2vHjh2SpDVr1mjIkCEaMWKEunXrppycHO3evVtlZWWSpNWrVys9PV1JSUnq27evHnnkEa1du1aSdPjwYe3atUtPPvmkunTpojFjxugXv/iFXnvtNUlqcm4AANCyeSw8BQcHKyQkRBs3btT58+d16NAh7d+/X927d1dBQYESExNlsVgkSRaLRTfddJPy8/MlSQUFBUpKSnKM1bZtW8XGxqqgoEAnTpzQsWPH1Lt3b0d/YmKijhw5ovLychUUFKht27Zq3769U/+BAwccY19pbgAA0LJ5LDwFBQVp5syZWr9+vaxWq4YMGaLbbrtNY8aMUUVFhaKjo52Oj4iI0PHjxyVJ5eXll+2vqKiQJKf+yMhISXL0X+rcEydOSFKTcwMAgJYtwJOTf/nll7r99tt1//33q6SkRHPmzNHNN9+smpoaBQYGOh0bGBiouro6SdK5c+cu23/u3DnH63/tk6S6uromx26q38Q/L165TeN47h4X1x7eI2b4bPkO1sq3+Np6NbdOj4WnvXv36s0339Tu3bsVHBysnj176sSJE1q6dKk6dOhwUVipq6tTcHCwpAtXrS7VHxIS4hSUgoKCHL+WpJCQkMue29TYjf0mIiKuNz7Hk+Pi2hAe3trTJfgsPlu+g7XyLdfaenksPB08eFAdO3Z0CiU9evTQsmXLlJSUpMrKSqfjKysrHbfTYmJiLtkfFRWlmJgYSRduvzXua2q8ldfYf7lzrzT2d2/lNcfJk9/Ibjc+7bIslgtvQHePi4v5+/v5bAg5depbNTTYmj4QDny2fAdr5Vt8bb0a622Kx/Y8RUdH66uvvnK6ynPo0CG1b99eVqtVBw4ckP2fv9N2u1379++X1WqVJFmtVuXl5TnOO3bsmI4dOyar1aqYmBjFxsY69efl5Sk2NlbR0dGKj4/XkSNHnPYw5eXlKT4+3jH2leY2Ybe7/+tqjcvXxb/PvszTv3+++MXvm+98sVa+9eVr69UcHgtPKSkpuu666/T444/r73//uz744AMtW7ZM9957rwYPHqyqqiplZ2ertLRU2dnZqqmp0ZAhQyRJY8eO1ebNm7VhwwYVFxdr+vTpGjhwoDp06ODoX7hwofbt26d9+/Zp0aJFGj9+vCSpQ4cOuvXWWzVt2jQVFxdrw4YN2rp1q8aNGydJTc4NAABaNo/dtrv++uv1yiuvKDs7W6NHj1abNm00efJk/ed//qcsFoteeOEFzZo1S2+88Ya6du2q5cuXq1WrVpKkhIQEPfHEE1qyZInOnDmjfv36ac6cOY6xJ06cqJMnTyotLU3+/v4aPXq0JkyY4OjPyclRZmam7r77bkVFRWnu3Lnq1auXJCk0NPSKcwMAgJbNYrc39yIVTFVWun/PU2Tk9W4fFxcLCLiw52nokg/1+dEqT5fTLHGxYdqW3l+nTn2r+nr2PJngs+U7WCvf4mvr1VhvU/jBwAAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYITwAAAAYCPF0AAPfy9/etvxPZbHbZbHZPlwEAzUZ4Aq4RUaFBarDZFRYW4ulSjNQ32HTmdDUBCoDPIDwB14iwkAD5+1n00LoDKi0/6+lymqVzdKie/WWC/PwshCcAPoPwBFxjSsvP6vOjVZ4uAwCuWb61OQIAAMDDCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGCE8AAAAGPBqe6urqNHv2bPXu3Vu33HKLFi9eLLvdLkkqKirSmDFjZLVaNWrUKB08eNDp3K1bt2rQoEGyWq2aMmWKvv76a0ef3W7XwoUL1bdvXyUnJysnJ0c2m83Rf+rUKU2dOlUJCQlKSUnR5s2bncZuam4AANByeTQ8Pfnkk/roo4/08ssva9GiRXrjjTe0fv16VVdXKzU1VUlJSdq4caMSEhI0adIkVVdXS5IKCwuVmZmptLQ0rV+/XlVVVcrIyHCMu3LlSm3dulW5ublasmSJtmzZopUrVzr6MzIy9M0332j9+vWaPHmyHn/8cRUWFkpSk3MDAICWLcBTE58+fVpvvfWWVq5cqV69ekmSHnjgARUUFCggIEBBQUGaPn26LBaLMjMz9ec//1k7duzQyJEjtWbNGg0ZMkQjRoyQJOXk5Oj2229XWVmZOnTooNWrVys9PV1JSUmSpEceeUTPPvusJk6cqMOHD2vXrl16//331b59e3Xp0kX5+fl67bXX1KtXL23fvv2KcwMAgJbNY1ee8vLyFBoaquTkZEdbamqq5s2bp4KCAiUmJspisUiSLBaLbrr
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"affiliations[\"token_overlap\"].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
2 years ago
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) \n136998 WOS:000321029900001 \\\n136999 WOS:000321029900001 \n137000 WOS:000321029900001 \n137001 WOS:000321029900001 \n137002 WOS:000321029900001 \n... ... \n2426115 WOS:000934156000001 \n2426116 WOS:000934156000001 \n2426117 WOS:000934156000001 \n2426118 WOS:000934156000001 \n2426119 WOS:000934156000001 \n\n Affiliations \n136998 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH (A*STAR) \\\n136999 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH (A*STAR) \n137000 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH (A*STAR) \n137001 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH (A*STAR) \n137002 A*STAR - BIOINFORMATICS INSTITUTE (BII) \n... ... \n2426115 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426116 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426117 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426118 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426119 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n\n Affiliations_merged \n136998 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH A*STAR \\\n136999 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH A*STAR \n137000 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH A*STAR \n137001 AGENCY FOR SCIENCE TECHNOLOGY & RESEARCH A*STAR \n137002 A*STAR - BIOINFORMATICS INSTITUTE BII \n... ... \n2426115 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426116 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426117 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426118 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n2426119 A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI... \n\n Address Country \n136998 Univ Oulu, Ctr Machine Vis Res, SF-90100 Oulu... Finland \\\n136999 Univ Calif Santa Barbara, Dept Comp Sci, Sant... United States \n137000 Chinese Acad Sci, NLPR, Inst Automat, Beijing... China \n137001 Natl Univ Singapore, Bioinformat Inst, A STAR... Singapore \n137002 Univ Oulu, Ctr Machine Vis Res, SF-90100 Oulu... Finland \n... ... ... \n2426115 Chinese Acad Sci, Ningbo Inst Mat Technol & E... China \n2426116 Univ N Carolina, Dept Radiol, Chapel Hill, NC... United States \n2426117 Univ Cambridge, DAMTP, Cambridge CB2 1TN, Eng... United Kingdom \n2426118 Univ Leeds, Computat Med & Royal Acad, Leeds ... United Kingdom \n2426119 Katholieke Univ Leuven, B-3000 Leuven, Belgium Belgium \n\n City Country_Type Institution levehnstein \n136998 Oulu EU UNIV OULU 45 \n136999 Santa Barbara Other UNIV CALIF SANTA BARBARA 37 \n137000 Beijing China CHINESE ACAD SCI 40 \n137001 Singapore Other NATL UNIV SINGAPORE 41 \n137002 Oulu EU UNIV OULU 35 \n... ... ... ... ... \n2426115 Beijing China CHINESE ACAD SCI 47 \n2426116 Carolina Other UNIV N CAROLINA 47 \n2426117 Cambridge Other UNIV CAMBRIDGE 48 \n2426118 Leeds Other UNIV LEEDS 50 \n2426119 Leuven EU KATHOLIEKE UNIV LEUVEN 45 \n\n[711 rows x 9 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Affiliations</th>\n <th>Affiliations_merged</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n <th>levehnstein</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>136998</th>\n <td>WOS:000321029900001</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH (A*STAR)</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH A*STAR</td>\n <td>Univ Oulu, Ctr Machine Vis Res, SF-90100 Oulu...</td>\n <td>Finland</td>\n <td>Oulu</td>\n <td>EU</td>\n <td>UNIV OULU</td>\n <td>45</td>\n </tr>\n <tr>\n <th>136999</th>\n <td>WOS:000321029900001</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH (A*STAR)</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH A*STAR</td>\n <td>Univ Calif Santa Barbara, Dept Comp Sci, Sant...</td>\n <td>United States</td>\n <td>Santa Barbara</td>\n <td>Other</td>\n <td>UNIV CALIF SANTA BARBARA</td>\n <td>37</td>\n </tr>\n <tr>\n <th>137000</th>\n <td>WOS:000321029900001</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH (A*STAR)</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH A*STAR</td>\n <td>Chinese Acad Sci, NLPR, Inst Automat, Beijing...</td>\n <td>China</td>\n <td>Beijing</td>\n <td>China</td>\n <td>CHINESE ACAD SCI</td>\n <td>40</td>\n </tr>\n <tr>\n <th>137001</th>\n <td>WOS:000321029900001</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH (A*STAR)</td>\n <td>AGENCY FOR SCIENCE TECHNOLOGY &amp; RESEARCH A*STAR</td>\n <td>Natl Univ Singapore, Bioinformat Inst, A STAR...</td>\n <td>Singapore</td>\n <td>Singapore</td>\n <td>Other</td>\n <td>NATL UNIV SINGAPORE</td>\n <td>41</td>\n </tr>\n <tr>\n <th>137002</th>\n <td>WOS:000321029900001</td>\n <td>A*STAR - BIOINFORMATICS INSTITUTE (BII)</td>\n <td>A*STAR - BIOINFORMATICS INSTITUTE BII</td>\n <td>Univ Oulu, Ctr Machine Vis Res, SF-90100 Oulu...</td>\n <td>Finland</td>\n <td>Oulu</td>\n <td>EU</td>\n <td>UNIV OULU</td>\n <td>35</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>2426115</th>\n <td>WOS:000934156000001</td>\n <td>A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI...</td>\n <td>A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI...</td>\n <td>Chinese Acad Sci, Ningbo Inst Mat Technol &amp; E...</td>\n <td>China</td>\n <td>Beijing</td>\n <td>China</td>\n <td>CHINESE ACAD SCI</td>\n <td>47</td>\n </tr>\n <tr>\n <th>2426116</th>\n <td>WOS:000934156000001</td>\n <td>A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI...</td>\n <td>A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI...</td>\n <td>Univ N Carolina, Dept Radiol, Chapel Hill, NC...</td>\n <td>United States</td>\n <td>Carolina</td>\n <td>Other</td>\n <td>UNIV N CAROLINA</td>\n <td>47</td>\n </tr>\n <tr>\n <th>2426117</th>\n <td>WOS:000934156000001</td>\n <td>A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI...</td>\n <td>A*STAR - INSTITUTE OF HIGH PERFORMANCE COMPUTI...</td>\n <td>Univ Cambridge, DAMTP, Cambridge CB2 1TN, Eng...</td>\n <td>United Kingdom</td>\n <td>Cambridge</td>\n <td>Other</td>\n <td>UNIV CAMBRIDGE</td>\n <td>48</td>\n </tr
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[affiliations[\"Affiliations\"].str.contains(\"A*STAR\",regex=False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>UT (Unique WOS ID)</th>\n",
" <th>Affiliations</th>\n",
" <th>Affiliations_merged</th>\n",
" <th>Address</th>\n",
" <th>Country</th>\n",
" <th>City</th>\n",
" <th>Country_Type</th>\n",
" <th>Institution</th>\n",
" <th>levehnstein</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2430154</th>\n",
" <td>WOS:000947693400001</td>\n",
" <td>UNIVERSITAT POLITECNICA DE VALENCIA</td>\n",
" <td>UNIVERSITAT POLITECNICA DE VALENCIA</td>\n",
" <td>Univ Politecn Valencia, European Inst Innovat...</td>\n",
" <td>Spain</td>\n",
" <td>Valencia</td>\n",
" <td>EU</td>\n",
" <td>UNIV POLITECN VALENCIA</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430132</th>\n",
" <td>WOS:000947693400001</td>\n",
" <td>SHANGHAITECH UNIVERSITY</td>\n",
" <td>SHANGHAITECH UNIVERSITY</td>\n",
" <td>ShanghaiTech Univ, Shanghai Inst Adv Immunoch...</td>\n",
" <td>China</td>\n",
" <td>Shanghai</td>\n",
" <td>China</td>\n",
" <td>SHANGHAITECH UNIV</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430139</th>\n",
" <td>WOS:000947693400001</td>\n",
" <td>SHANGHAI OCEAN UNIVERSITY</td>\n",
" <td>SHANGHAI UNIVERSITY</td>\n",
" <td>Shanghai Ocean Univ, Coll Fisheries &amp; Life Sc...</td>\n",
" <td>China</td>\n",
" <td>Shanghai</td>\n",
" <td>China</td>\n",
" <td>SHANGHAI OCEAN UNIV</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430146</th>\n",
" <td>WOS:000947693400001</td>\n",
" <td>SHANGHAI JIAO TONG UNIVERSITY</td>\n",
" <td>SHANGHAI UNIVERSITY</td>\n",
" <td>Shanghai Jiao Tong Univ, Ruijin Hosp, Sch Med...</td>\n",
" <td>China</td>\n",
" <td>Meda</td>\n",
" <td>China</td>\n",
" <td>SHANGHAI JIAO TONG UNIV</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430125</th>\n",
" <td>WOS:000947693400001</td>\n",
" <td>HUZHOU UNIVERSITY</td>\n",
" <td>HUZHOU UNIVERSITY</td>\n",
" <td>Huzhou Univ, Sch Informat Engn, Huzhou 313000...</td>\n",
" <td>China</td>\n",
" <td>Huzhou</td>\n",
" <td>China</td>\n",
" <td>HUZHOU UNIV</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430113</th>\n",
" <td>WOS:000946746700001</td>\n",
" <td>SUZHOU UNIVERSITY OF SCIENCE &amp; TECHNOLOGY</td>\n",
" <td>SUZHOU UNIVERSITY OF SCIENCE &amp; TECHNOLOGY</td>\n",
" <td>Suzhou Univ Sci &amp; Technol, Sch Elect &amp; Inform...</td>\n",
" <td>China</td>\n",
" <td>Suzhou</td>\n",
" <td>China</td>\n",
" <td>SUZHOU UNIV SCI &amp; TECHNOL</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430118</th>\n",
" <td>WOS:000946746700001</td>\n",
" <td>POLYTECHNIC UNIVERSITY OF MILAN</td>\n",
" <td>UNIVERSITY OF MILAN</td>\n",
" <td>Politecn Milan, Dept Mech Engn, Milan, Italy;</td>\n",
" <td>Italy</td>\n",
" <td>Milano</td>\n",
" <td>EU</td>\n",
" <td>POLITECN MILAN</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430123</th>\n",
" <td>WOS:000946746700001</td>\n",
" <td>HONG KONG POLYTECHNIC UNIVERSITY</td>\n",
" <td>HONG KONG POLYTECHNIC UNIVERSITY</td>\n",
" <td>Hong Kong Polytech Univ, Dept Comp, Hong Kong...</td>\n",
" <td>China</td>\n",
" <td>Hong Kong</td>\n",
" <td>China</td>\n",
" <td>HONG KONG POLYTECH UNIV</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430111</th>\n",
" <td>WOS:000945297300001</td>\n",
" <td>UNIVERSITY OF PANNONIA</td>\n",
" <td>UNIVERSITY OF PANNONIA</td>\n",
" <td>Univ Pannonia, Dept Elect Engn &amp; Informat Sys...</td>\n",
" <td>Hungary</td>\n",
" <td>Veszprém</td>\n",
" <td>EU</td>\n",
" <td>UNIV PANNONIA</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2430107</th>\n",
" <td>WOS:000945297300001</td>\n",
" <td>SHENYANG UNIVERSITY OF TECHNOLOGY</td>\n",
" <td>SHENYANG UNIVERSITY</td>\n",
" <td>Shenyang Univ Technol, Sch Elect Engn, Dept B...</td>\n",
" <td>China</td>\n",
" <td>Shenyang</td>\n",
" <td>China</td>\n",
" <td>SHENYANG UNIV TECHNOL</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" UT (Unique WOS ID) Affiliations \n",
"2430154 WOS:000947693400001 UNIVERSITAT POLITECNICA DE VALENCIA \\\n",
"2430132 WOS:000947693400001 SHANGHAITECH UNIVERSITY \n",
"2430139 WOS:000947693400001 SHANGHAI OCEAN UNIVERSITY \n",
"2430146 WOS:000947693400001 SHANGHAI JIAO TONG UNIVERSITY \n",
"2430125 WOS:000947693400001 HUZHOU UNIVERSITY \n",
"2430113 WOS:000946746700001 SUZHOU UNIVERSITY OF SCIENCE & TECHNOLOGY \n",
"2430118 WOS:000946746700001 POLYTECHNIC UNIVERSITY OF MILAN \n",
"2430123 WOS:000946746700001 HONG KONG POLYTECHNIC UNIVERSITY \n",
"2430111 WOS:000945297300001 UNIVERSITY OF PANNONIA \n",
"2430107 WOS:000945297300001 SHENYANG UNIVERSITY OF TECHNOLOGY \n",
"\n",
" Affiliations_merged \n",
"2430154 UNIVERSITAT POLITECNICA DE VALENCIA \\\n",
"2430132 SHANGHAITECH UNIVERSITY \n",
"2430139 SHANGHAI UNIVERSITY \n",
"2430146 SHANGHAI UNIVERSITY \n",
"2430125 HUZHOU UNIVERSITY \n",
"2430113 SUZHOU UNIVERSITY OF SCIENCE & TECHNOLOGY \n",
"2430118 UNIVERSITY OF MILAN \n",
"2430123 HONG KONG POLYTECHNIC UNIVERSITY \n",
"2430111 UNIVERSITY OF PANNONIA \n",
"2430107 SHENYANG UNIVERSITY \n",
"\n",
" Address Country \n",
"2430154 Univ Politecn Valencia, European Inst Innovat... Spain \\\n",
"2430132 ShanghaiTech Univ, Shanghai Inst Adv Immunoch... China \n",
"2430139 Shanghai Ocean Univ, Coll Fisheries & Life Sc... China \n",
"2430146 Shanghai Jiao Tong Univ, Ruijin Hosp, Sch Med... China \n",
"2430125 Huzhou Univ, Sch Informat Engn, Huzhou 313000... China \n",
"2430113 Suzhou Univ Sci & Technol, Sch Elect & Inform... China \n",
"2430118 Politecn Milan, Dept Mech Engn, Milan, Italy; Italy \n",
"2430123 Hong Kong Polytech Univ, Dept Comp, Hong Kong... China \n",
"2430111 Univ Pannonia, Dept Elect Engn & Informat Sys... Hungary \n",
"2430107 Shenyang Univ Technol, Sch Elect Engn, Dept B... China \n",
"\n",
" City Country_Type Institution levehnstein \n",
"2430154 Valencia EU UNIV POLITECN VALENCIA 13 \n",
"2430132 Shanghai China SHANGHAITECH UNIV 6 \n",
"2430139 Shanghai China SHANGHAI OCEAN UNIV 6 \n",
"2430146 Meda China SHANGHAI JIAO TONG UNIV 6 \n",
"2430125 Huzhou China HUZHOU UNIV 6 \n",
"2430113 Suzhou China SUZHOU UNIV SCI & TECHNOL 16 \n",
"2430118 Milano EU POLITECN MILAN 18 \n",
"2430123 Hong Kong China HONG KONG POLYTECH UNIV 9 \n",
"2430111 Veszprém EU UNIV PANNONIA 9 \n",
"2430107 Shenyang China SHENYANG UNIV TECHNOL 12 "
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations = affiliations.sort_values(by=[record_col,\"Affiliations\",\"levehnstein\"], ascending=[False,False,True])\n",
"affiliations_merge = affiliations.drop_duplicates(subset=[record_col,\"Affiliations\"])\n",
"affiliations_merge.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WoS Categories\n",
" Engineering, Electrical & Electronic 1703\n",
"Computer Science, Artificial Intelligence 1366\n",
"Computer Science, Information Systems 973\n",
" Telecommunications 834\n",
" Imaging Science & Photographic Technology 762\n",
" ... \n",
" Crystallography 1\n",
"Mining & Mineral Processing 1\n",
" Art 1\n",
"Archaeology 1\n",
"Physics, Mathematical 1\n",
"Name: count, Length: 379, dtype: int64"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Research Areas\n",
"Engineering 3740\n",
"Computer Science 3466\n",
"Telecommunications 888\n",
"Imaging Science & Photographic Technology 779\n",
"Remote Sensing 716\n",
" ... \n",
"Otorhinolaryngology 1\n",
"Medical Ethics 1\n",
"Anesthesiology 1\n",
"Biomedical Social Sciences 1\n",
"History & Philosophy of Science 1\n",
"Name: count, Length: 141, dtype: int64"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Research Areas\n",
"Engineering 3740\n",
"Computer Science 3466\n",
"Telecommunications 888\n",
"Imaging Science & Photographic Technology 779\n",
"Remote Sensing 716\n",
" ... \n",
"Otorhinolaryngology 1\n",
"Medical Ethics 1\n",
"Anesthesiology 1\n",
"Biomedical Social Sciences 1\n",
"History & Philosophy of Science 1\n",
"Name: count, Length: 141, dtype: int64"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Domain_English', 'Field_English', 'SubField_English']"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[c for c in wos.columns if \"_English\" in c]"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"wos = wos[((wos[\"Publication Year\"]<2023) & (~wos['Domain_English'].isna()))]\n",
"\n",
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
"for m in metrix_levels:\n",
" wos[m] = wos[m].replace({\"article-level classification\":\"Miscellaneous\"})\n"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Publication Type</th>\n",
" <th>Authors</th>\n",
" <th>Book Authors</th>\n",
" <th>Book Editors</th>\n",
" <th>Book Group Authors</th>\n",
" <th>Author Full Names</th>\n",
" <th>Book Author Full Names</th>\n",
" <th>Group Authors</th>\n",
" <th>Article Title</th>\n",
" <th>Source Title</th>\n",
" <th>...</th>\n",
" <th>Web of Science Record</th>\n",
" <th>issn_var</th>\n",
" <th>issn</th>\n",
" <th>Domain_English</th>\n",
" <th>Field_English</th>\n",
" <th>SubField_English</th>\n",
" <th>2.00 SEQ</th>\n",
" <th>Source_title</th>\n",
" <th>srcid</th>\n",
" <th>issn_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>J</td>\n",
" <td>Salucci, M; Arrebola, M; Shan, T; Li, MK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Salucci, Marco; Arrebola, Manuel; Shan, Tao; L...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Artificial Intelligence: New Frontiers in Real...</td>\n",
" <td>IEEE TRANSACTIONS ON ANTENNAS AND PROPAGATION</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>issn</td>\n",
" <td>0018926x</td>\n",
" <td>Applied Sciences</td>\n",
" <td>Information &amp; Communication Technologies</td>\n",
" <td>Networking &amp; Telecommunications</td>\n",
" <td>37</td>\n",
" <td>IEEE Transactions on Antennas and Propagation</td>\n",
" <td>1.733700e+04</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9714</th>\n",
" <td>J</td>\n",
" <td>Huang, Y; Fu, ZT; Franzke, CLE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Huang, Yu; Fu, Zuntao; Franzke, Christian L. E.</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Detecting causality from time series in a mach...</td>\n",
" <td>CHAOS</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>issn</td>\n",
" <td>10541500</td>\n",
" <td>Natural Sciences</td>\n",
" <td>Physics &amp; Astronomy</td>\n",
" <td>Fluids &amp; Plasmas</td>\n",
" <td>170</td>\n",
" <td>Chaos</td>\n",
" <td>2.743000e+04</td>\n",
" <td>issn2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9697</th>\n",
" <td>J</td>\n",
" <td>Feng, DC; Cetiner, B; Kakavand, MRA; Taciroglu, E</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Feng, De-Cheng; Cetiner, Barbaros; Kakavand, M...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Data-Driven Approach to Predict the Plastic Hi...</td>\n",
" <td>JOURNAL OF STRUCTURAL ENGINEERING</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>issn</td>\n",
" <td>07339445</td>\n",
" <td>Applied Sciences</td>\n",
" <td>Engineering</td>\n",
" <td>Civil Engineering</td>\n",
" <td>23</td>\n",
" <td>Journal of Structural Engineering (United States)</td>\n",
" <td>1.630500e+04</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9699</th>\n",
" <td>J</td>\n",
" <td>Zhao, YL; Dong, S; Jiang, FY; Soares, CG</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Zhao, Yuliang; Dong, Sheng; Jiang, Fengyuan; G...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>System Reliability Analysis of an Offshore Jac...</td>\n",
" <td>JOURNAL OF OCEAN UNIVERSITY OF CHINA</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>issn</td>\n",
" <td>16725182</td>\n",
" <td>Applied Sciences</td>\n",
" <td>Agriculture, Fisheries &amp; Forestry</td>\n",
" <td>Fisheries</td>\n",
" <td>3</td>\n",
" <td>Journal of Ocean University of China</td>\n",
" <td>6.100153e+09</td>\n",
" <td>issn2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9701</th>\n",
" <td>J</td>\n",
" <td>Li, XH; Yang, DK; Yang, JS; Zheng, G; Han, GQ;...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Li, Xiaohui; Yang, Dongkai; Yang, Jingsong; Zh...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Analysis of coastal wind speed retrieval from ...</td>\n",
" <td>REMOTE SENSING OF ENVIRONMENT</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>issn</td>\n",
" <td>00344257</td>\n",
" <td>Applied Sciences</td>\n",
" <td>Engineering</td>\n",
" <td>Geological &amp; Geomatics Engineering</td>\n",
" <td>26</td>\n",
" <td>Remote Sensing of Environment</td>\n",
" <td>1.250300e+04</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3066</th>\n",
" <td>J</td>\n",
" <td>He, Q; Zha, C; Song, W; Hao, ZZ; Du, YL; Liott...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>He, Qi; Zha, Cheng; Song, Wei; Hao, Zengzhou; ...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Improved Particle Swarm Optimization for Sea S...</td>\n",
" <td>ENERGIES</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>eissn</td>\n",
" <td>19961073</td>\n",
" <td>Applied Sciences</td>\n",
" <td>Enabling &amp; Strategic Technologies</td>\n",
" <td>Energy</td>\n",
" <td>14</td>\n",
" <td>Energies</td>\n",
" <td>6.293200e+04</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5097</th>\n",
" <td>J</td>\n",
" <td>Hasan, MM; Popp, J; Olah, J</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Hasan, Md Morshadul; Popp, Jozsef; Olah, Judit</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Current landscape and influence of big data on...</td>\n",
" <td>JOURNAL OF BIG DATA</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>eissn</td>\n",
" <td>21961115</td>\n",
" <td>Applied Sciences</td>\n",
" <td>Information &amp; Communication Technologies</td>\n",
" <td>Artificial Intelligence &amp; Image Processing</td>\n",
" <td>31</td>\n",
" <td>Journal of Big Data</td>\n",
" <td>2.110079e+10</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11369</th>\n",
" <td>J</td>\n",
" <td>Li, Y; Cheng, G; Pang, YS; Kuai, M</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Li, Yong; Cheng, Gang; Pang, Yusong; Kuai, Moshen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Planetary Gear Fault Diagnosis via Feature Ima...</td>\n",
" <td>SENSORS</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>eissn</td>\n",
" <td>14248220</td>\n",
" <td>Natural Sciences</td>\n",
" <td>Chemistry</td>\n",
" <td>Analytical Chemistry</td>\n",
" <td>149</td>\n",
" <td>Sensors (Switzerland)</td>\n",
" <td>1.301240e+05</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11368</th>\n",
" <td>J</td>\n",
" <td>Zeng, R; Rossiter, DG; Zhang, JP; Cai, K; Gao,...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Zeng, Rong; Rossiter, David G.; Zhang, Jiapeng...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>How Well Can Reflectance Spectroscopy Allocate...</td>\n",
" <td>AGRONOMY-BASEL</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>eissn</td>\n",
" <td>20734395</td>\n",
" <td>Natural Sciences</td>\n",
" <td>Biology</td>\n",
" <td>Plant Biology &amp; Botany</td>\n",
" <td>147</td>\n",
" <td>Agronomy</td>\n",
" <td>2.110045e+10</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11362</th>\n",
" <td>J</td>\n",
" <td>Jia, Y; Jin, SG; Savi, P; Gao, Y; Tang, J; Che...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Jia, Yan; Jin, Shuanggen; Savi, Patrizia; Gao,...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>GNSS-R Soil Moisture Retrieval Based on a XGbo...</td>\n",
" <td>REMOTE SENSING</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>eissn</td>\n",
" <td>20724292</td>\n",
" <td>Applied Sciences</td>\n",
" <td>Engineering</td>\n",
" <td>Geological &amp; Geomatics Engineering</td>\n",
" <td>26</td>\n",
" <td>Remote Sensing</td>\n",
" <td>8.643000e+04</td>\n",
" <td>issn1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8592 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Publication Type Authors \n",
"0 J Salucci, M; Arrebola, M; Shan, T; Li, MK \\\n",
"9714 J Huang, Y; Fu, ZT; Franzke, CLE \n",
"9697 J Feng, DC; Cetiner, B; Kakavand, MRA; Taciroglu, E \n",
"9699 J Zhao, YL; Dong, S; Jiang, FY; Soares, CG \n",
"9701 J Li, XH; Yang, DK; Yang, JS; Zheng, G; Han, GQ;... \n",
"... ... ... \n",
"3066 J He, Q; Zha, C; Song, W; Hao, ZZ; Du, YL; Liott... \n",
"5097 J Hasan, MM; Popp, J; Olah, J \n",
"11369 J Li, Y; Cheng, G; Pang, YS; Kuai, M \n",
"11368 J Zeng, R; Rossiter, DG; Zhang, JP; Cai, K; Gao,... \n",
"11362 J Jia, Y; Jin, SG; Savi, P; Gao, Y; Tang, J; Che... \n",
"\n",
" Book Authors Book Editors Book Group Authors \n",
"0 NaN NaN NaN \\\n",
"9714 NaN NaN NaN \n",
"9697 NaN NaN NaN \n",
"9699 NaN NaN NaN \n",
"9701 NaN NaN NaN \n",
"... ... ... ... \n",
"3066 NaN NaN NaN \n",
"5097 NaN NaN NaN \n",
"11369 NaN NaN NaN \n",
"11368 NaN NaN NaN \n",
"11362 NaN NaN NaN \n",
"\n",
" Author Full Names \n",
"0 Salucci, Marco; Arrebola, Manuel; Shan, Tao; L... \\\n",
"9714 Huang, Yu; Fu, Zuntao; Franzke, Christian L. E. \n",
"9697 Feng, De-Cheng; Cetiner, Barbaros; Kakavand, M... \n",
"9699 Zhao, Yuliang; Dong, Sheng; Jiang, Fengyuan; G... \n",
"9701 Li, Xiaohui; Yang, Dongkai; Yang, Jingsong; Zh... \n",
"... ... \n",
"3066 He, Qi; Zha, Cheng; Song, Wei; Hao, Zengzhou; ... \n",
"5097 Hasan, Md Morshadul; Popp, Jozsef; Olah, Judit \n",
"11369 Li, Yong; Cheng, Gang; Pang, Yusong; Kuai, Moshen \n",
"11368 Zeng, Rong; Rossiter, David G.; Zhang, Jiapeng... \n",
"11362 Jia, Yan; Jin, Shuanggen; Savi, Patrizia; Gao,... \n",
"\n",
" Book Author Full Names Group Authors \n",
"0 NaN NaN \\\n",
"9714 NaN NaN \n",
"9697 NaN NaN \n",
"9699 NaN NaN \n",
"9701 NaN NaN \n",
"... ... ... \n",
"3066 NaN NaN \n",
"5097 NaN NaN \n",
"11369 NaN NaN \n",
"11368 NaN NaN \n",
"11362 NaN NaN \n",
"\n",
" Article Title \n",
"0 Artificial Intelligence: New Frontiers in Real... \\\n",
"9714 Detecting causality from time series in a mach... \n",
"9697 Data-Driven Approach to Predict the Plastic Hi... \n",
"9699 System Reliability Analysis of an Offshore Jac... \n",
"9701 Analysis of coastal wind speed retrieval from ... \n",
"... ... \n",
"3066 Improved Particle Swarm Optimization for Sea S... \n",
"5097 Current landscape and influence of big data on... \n",
"11369 Planetary Gear Fault Diagnosis via Feature Ima... \n",
"11368 How Well Can Reflectance Spectroscopy Allocate... \n",
"11362 GNSS-R Soil Moisture Retrieval Based on a XGbo... \n",
"\n",
" Source Title ... \n",
"0 IEEE TRANSACTIONS ON ANTENNAS AND PROPAGATION ... \\\n",
"9714 CHAOS ... \n",
"9697 JOURNAL OF STRUCTURAL ENGINEERING ... \n",
"9699 JOURNAL OF OCEAN UNIVERSITY OF CHINA ... \n",
"9701 REMOTE SENSING OF ENVIRONMENT ... \n",
"... ... ... \n",
"3066 ENERGIES ... \n",
"5097 JOURNAL OF BIG DATA ... \n",
"11369 SENSORS ... \n",
"11368 AGRONOMY-BASEL ... \n",
"11362 REMOTE SENSING ... \n",
"\n",
" Web of Science Record issn_var issn Domain_English \n",
"0 0 issn 0018926x Applied Sciences \\\n",
"9714 0 issn 10541500 Natural Sciences \n",
"9697 0 issn 07339445 Applied Sciences \n",
"9699 0 issn 16725182 Applied Sciences \n",
"9701 0 issn 00344257 Applied Sciences \n",
"... ... ... ... ... \n",
"3066 0 eissn 19961073 Applied Sciences \n",
"5097 0 eissn 21961115 Applied Sciences \n",
"11369 0 eissn 14248220 Natural Sciences \n",
"11368 0 eissn 20734395 Natural Sciences \n",
"11362 0 eissn 20724292 Applied Sciences \n",
"\n",
" Field_English \n",
"0 Information & Communication Technologies \\\n",
"9714 Physics & Astronomy \n",
"9697 Engineering \n",
"9699 Agriculture, Fisheries & Forestry \n",
"9701 Engineering \n",
"... ... \n",
"3066 Enabling & Strategic Technologies \n",
"5097 Information & Communication Technologies \n",
"11369 Chemistry \n",
"11368 Biology \n",
"11362 Engineering \n",
"\n",
" SubField_English 2.00 SEQ \n",
"0 Networking & Telecommunications 37 \\\n",
"9714 Fluids & Plasmas 170 \n",
"9697 Civil Engineering 23 \n",
"9699 Fisheries 3 \n",
"9701 Geological & Geomatics Engineering 26 \n",
"... ... ... \n",
"3066 Energy 14 \n",
"5097 Artificial Intelligence & Image Processing 31 \n",
"11369 Analytical Chemistry 149 \n",
"11368 Plant Biology & Botany 147 \n",
"11362 Geological & Geomatics Engineering 26 \n",
"\n",
" Source_title srcid \n",
"0 IEEE Transactions on Antennas and Propagation 1.733700e+04 \\\n",
"9714 Chaos 2.743000e+04 \n",
"9697 Journal of Structural Engineering (United States) 1.630500e+04 \n",
"9699 Journal of Ocean University of China 6.100153e+09 \n",
"9701 Remote Sensing of Environment 1.250300e+04 \n",
"... ... ... \n",
"3066 Energies 6.293200e+04 \n",
"5097 Journal of Big Data 2.110079e+10 \n",
"11369 Sensors (Switzerland) 1.301240e+05 \n",
"11368 Agronomy 2.110045e+10 \n",
"11362 Remote Sensing 8.643000e+04 \n",
"\n",
" issn_type \n",
"0 issn1 \n",
"9714 issn2 \n",
"9697 issn1 \n",
"9699 issn2 \n",
"9701 issn1 \n",
"... ... \n",
"3066 issn1 \n",
"5097 issn1 \n",
"11369 issn1 \n",
"11368 issn1 \n",
"11362 issn1 \n",
"\n",
"[8592 rows x 81 columns]"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Domain_English', 'Field_English', 'SubField_English']"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metrix_levels"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 137,
2 years ago
"metadata": {},
"outputs": [],
"source": [
1 year ago
"outdir=\"wos_processed_data\""
]
},
{
"cell_type": "code",
"execution_count": 134,
"outputs": [],
"source": [
2 years ago
"os.makedirs(outdir, exist_ok=True)\n",
"\n",
"wos.to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n",
"\n",
"locations.drop(columns=\"Addresses\").to_excel(f\"{outdir}/wos_addresses.xlsx\", index=False)\n",
"\n",
"affiliations_merge.to_excel(f\"{outdir}/wos_affiliations.xlsx\", index=False)\n",
"\n",
"author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n",
"\n",
1 year ago
"univ_locations.to_excel(f\"{outdir}/wos_univ_locations.xlsx\", index=False)\n",
"mode_final.to_excel(f\"{outdir}/wos_univ_locations_v2.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
2 years ago
},
1 year ago
{
"cell_type": "code",
"execution_count": 138,
"outputs": [],
"source": [
"kw_df.to_excel(f\"{outdir}/keywords.xlsx\", index=False)\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
2 years ago
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Domain"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Domain_English</th>\n",
" <th>UT (Unique WOS ID)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Applied Sciences</td>\n",
" <td>5379</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Natural Sciences</td>\n",
" <td>1649</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Health Sciences</td>\n",
" <td>1106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Economic &amp; Social Sciences</td>\n",
" <td>289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Miscellaneous</td>\n",
" <td>156</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Arts &amp; Humanities</td>\n",
" <td>13</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Domain_English UT (Unique WOS ID)\n",
"0 Applied Sciences 5379\n",
"5 Natural Sciences 1649\n",
"3 Health Sciences 1106\n",
"2 Economic & Social Sciences 289\n",
"4 Miscellaneous 156\n",
"1 Arts & Humanities 13"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"group = 'Domain_English'\n",
"data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=record_col)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: xlabel='UT (Unique WOS ID)', ylabel='Domain_English'>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAuIAAAGwCAYAAADsTQBeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABcDElEQVR4nO3de3zP9f//8ft7Yyc7M5uxmWHMbHImMYdpk0RJyGmhUiTlnJyPKYWOioxPPqSS5BTGkDM15+aQU6X4OGxGxuz1+8PP++vdNraZXszterm8Lpe9X6/n6/l6vJ6f9dn9/fR8v94WwzAMAQAAAPhX2ZldAAAAAPAgIogDAAAAJiCIAwAAACYgiAMAAAAmIIgDAAAAJiCIAwAAACYgiAMAAAAmKGR2AQCylpGRoT/++ENubm6yWCxmlwMAAHLAMAxduHBB/v7+srO79Zw3QRy4R/3xxx8KCAgwuwwAAJAHJ06cUKlSpW7ZhiAO3KPc3NwkXf8P2d3d3eRqAABATqSkpCggIMD6d/xWCOLAPerGchR3d3eCOAAA95mcLCvlw5oAAACACQjiAAAAgAkI4gAAAIAJWCMO3OMavDlX9o7OZpcBAECBsuPtzmaXwIw4AAAAYAaCOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABgAoI4AAAAYAKCODIZMWKEHnroIevr2NhYtWrV6o76TEhIkMVi0fnz5++on1vJjzoBAAD+LQTx+8ymTZtkb2+v5s2b/2vXnDJliuLi4u76dXbu3KknnnhCxYsXl5OTk4KCgtS2bVudOnUqR+f/W3UCAADkB4L4fWbGjBl65ZVXtG7dOv3xxx//yjU9PDzk6el5V69x+vRpNWnSRN7e3vrhhx+0f/9+zZw5U/7+/rp48eI9UycAAEB+IYjfR1JTU/Xll1/qpZdeUvPmzTPN/t5Y/rFkyRJFRETIyclJderU0Z49e6xt4uLi5OnpqYULF6p8+fJycnJSdHS0Tpw4ke11/7nkIyMjQ+PHj1eZMmXk7OysKlWq6Ouvv7Y5Z+nSpQoJCZGzs7MaNWqko0eP3vLeNmzYoOTkZE2fPl1Vq1ZVmTJl1KhRI7333nsqU6aMtd3evXv1+OOPy93dXW5ubqpfv74OHz6cpzpvjFd8fLxq1KghFxcXPfzww0pKSrKp7fvvv1fNmjXl5OSkYsWK6cknn7QeS0tLU79+/VSyZEkVKVJEtWvXVkJCgvX4sWPH1KJFC3l5ealIkSIKCwvT0qVLbzkWAADgwUAQv4/Mnz9fFStWVIUKFdSxY0d9/vnnMgwjU7v+/ftr0qRJ2rZtm3x8fNSiRQtdvXrVevzSpUsaO3asZs+erQ0bNuj8+fNq165djusYP368Zs+erU8++UR79+7Va6+9po4dO2rt2rWSpBMnTuipp55SixYtlJiYqO7du2vQoEG37NPPz0/p6en69ttvs7wnSfr999/VoEEDOTo6avXq1dqxY4e6du2q9PT0PNV5w5AhQzRp0iRt375dhQoVUteuXa3HlixZoieffFKPPfaYfv75Z8XHx6tWrVrW47169dKmTZs0b9487dq1S23atFFMTIwOHjwoSerZs6fS0tK0bt067d69W2+99ZZcXV2zrDctLU0pKSk2GwAAKLgKmV0Acm7GjBnq2LGjJCkmJkbJyclau3atGjZsaNNu+PDhatq0qSRp1qxZKlWqlL799ls988wzkqSrV6/qgw8+UO3ata1tQkNDtXXrVpuQmZW0tDSNGzdOq1atUt26dSVJwcHB+vHHHzVt2jRFRkbq448/VtmyZTVp0iRJUoUKFawhNDt16tTRG2+8oWeffVY9evRQrVq11LhxY3Xu3Fm+vr6SpA8//FAeHh6aN2+eChcuLEkKCQnJc503jB071vp60KBBat68uS5fviwnJyeNHTtW7dq108iRI63tq1SpIkk6fvy4Zs6cqePHj8vf31+S1K9fPy1fvlwzZ87UuHHjdPz4cbVu3Vrh4eHWGrIzfvx4m+sAAICCjRnx+0RSUpK2bt2q9u3bS5IKFSqktm3basaMGZna3giekuTt7a0KFSpo//791n2FChVSzZo1ra8rVqwoT09PmzbZOXTokC5duqSmTZvK1dXVus2ePdu6RGT//v3WkJ9VTdkZO3as/vzzT33yyScKCwvTJ598oooVK2r37t2SpMTERNWvX98awu+0zhsiIiKsP5coUUKSrB8QTUxMVJMmTbK8xu7du3Xt2jWFhITYXGPt2rXWa/Tu3VtjxoxRvXr1NHz4cO3atSvbmgcPHqzk5GTrdqvlQgAA4P7HjPh9YsaMGUpPT7fOvEqSYRhydHTUBx98IA8Pj3+ljtTUVEnXl2yULFnS5pijo+Md91+0aFG1adNGbdq00bhx41S1alW98847mjVrlpydne9KnTcHe4vFIun6+nJJt7xmamqq7O3ttWPHDtnb29scu7H8pHv37oqOjtaSJUu0YsUKjR8/XpMmTdIrr7ySqT9HR8d8GUMAAHB/YEb8PpCenq7Zs2dr0qRJSkxMtG47d+6Uv7+/5s6da9N+8+bN1p/PnTunAwcOKDQ01Ka/7du3W18nJSXp/PnzNm2yU6lSJTk6Our48eMqV66czRYQECBJ1mUu2dWUUw4ODipbtqz1qSkRERFav369zXr3O6kzJyIiIhQfH5/lsapVq+ratWs6depUpmv4+flZ2wUEBKhHjx5asGCB+vbtq88++yzH1wcAAAUXM+L3gcWLF+vcuXPq1q1bppnv1q1ba8aMGerRo4d136hRo1S0aFH5+vpqyJAhKlasmM3TRAoXLqxXXnlFU6dOVaFChdSrVy/VqVPntuvDJcnNzU39+vXTa6+9poyMDD3yyCNKTk7Whg0b5O7uri5duqhHjx6aNGmS+vfvr+7du2vHjh23fb734sWLNW/ePLVr104hISEyDEPff/+9li5dqpkzZ0q6/sHI999/X+3atdPgwYPl4eGhzZs3q1atWqpQoUKu68yJ4cOHq0mTJipbtqzatWun9PR0LV26VAMHDlRISIg6dOigzp07a9KkSapatapOnz6t+Ph4RUREqHnz5urTp4+aNWumkJAQnTt3TmvWrMnRGx4AAFDwMSN+H5gxY4aioqKyXH7SunVrbd++3Wbt8YQJE/Tqq6+qevXq+vPPP/X999/LwcHBetzFxUUDBw7Us88+q3r16snV1VVffvlljusZPXq0hg4dqvHjxys0NFQxMTFasmSJ9TGDgYGB+uabb7Rw4UJVqVJFn3zyicaNG3fLPitVqiQXFxf17dtXDz30kOrUqaP58+dr+vTp6tSpk6Try1ZWr16t1NRURUZGqnr16vrss8+yXTN+uzpzomHDhvrqq6+0aNEiPfTQQ2rcuLHNbP/MmTPVuXNn9e3bVxUqVFCrVq20bds2BQYGSpKuXbumnj17Wq8fEhKijz76KMfXBwAABZfFyO5ZcbjvJCQkqFGjRjp37ly2X2wTFxenPn363NWvmkf+SElJkYeHh6q88onsHXO+Ph4AANzejrc735V+b/z9Tk5Olru7+y3bMiMOAAAAmIAgDgAAAJiAIF6ANGzYUIZhZLssRbr+NfAsSwEAADAfQRwAAAAwAUEcAAAAMAFBHAAAADABQRwAAAAwAUEcAAAAMAFBHAAAADABQRwAAAAwAUEcAAAAMAFBHAAAADABQRwAAAAwAUEcAAAAMAFBHAAAADABQRwAAAAwAUEcAAAAMAFBHAAAADBBIbMLAHBr68a0l7u7u9llAACAfMaMOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABgAoI4AAAAYAKCOAAAAGACgjgAAABggkJmFwDg1k5MqCM3J3uzy8B9KnDYbrNLAABkgxlxAAAAwAQEcQAAAMAEBHEAAADABARxAAAAwAQ
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.barplot(data, x=record_col, y=group)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# group = ['Publication Year','Domain_English']\n",
"# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n",
"# data"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 131,
2 years ago
"metadata": {},
1 year ago
"outputs": [],
2 years ago
"source": [
1 year ago
"# group = ['Publication Year','Domain_English']\n",
"# data = wos.groupby(group)[record_col].nunique().unstack(fill_value=0).stack().reset_index().rename(columns={0:record_col}).sort_values(ascending=False, by=group+[record_col])\n",
"# data"
2 years ago
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 130,
2 years ago
"metadata": {},
1 year ago
"outputs": [],
2 years ago
"source": [
1 year ago
"# g=sns.lineplot(data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# g.legend(title=None)"
2 years ago
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Field"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 129,
2 years ago
"metadata": {},
1 year ago
"outputs": [],
2 years ago
"source": [
1 year ago
"# group = ['Publication Year',\"Domain_English\",'Field_English']\n",
"# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n",
"# data"
2 years ago
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# g = sns.FacetGrid(data, col=\"Domain_English\", col_wrap=3, height=5)\n",
"# g.map_dataframe(sns.lineplot,x=group[0],y=record_col,hue=group[-1])\n",
"# g.set_titles(col_template=\"{col_name}\")\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# # g.add_legend()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 126,
2 years ago
"metadata": {},
1 year ago
"outputs": [],
2 years ago
"source": [
1 year ago
"# import matplotlib.pyplot as plt\n",
"# for cat in sorted(data[group[-2]].unique()):\n",
"# sub_data = data[data[group[-2]]==cat]\n",
"# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n",
"# ,group[-1],fill_value=0)\n",
"# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# g.legend(title=None)\n",
"# g.set_title(cat)\n",
"# g.yaxis.set_major_locator(MaxNLocator(integer=True))\n",
"# plt.show()"
2 years ago
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SubField"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 128,
2 years ago
"metadata": {},
1 year ago
"outputs": [],
2 years ago
"source": [
1 year ago
"# group = ['Publication Year',\"Domain_English\",'Field_English',\"SubField_English\"]\n",
"# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n",
"# data"
2 years ago
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 127,
2 years ago
"metadata": {},
1 year ago
"outputs": [],
2 years ago
"source": [
1 year ago
"# import matplotlib.pyplot as plt\n",
"# for cat in sorted(data[group[-2]].unique()):\n",
"# sub_data = data[data[group[-2]]==cat]\n",
"# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n",
"# ,group[-1],fill_value=0)\n",
"# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# g.legend(title=None,bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., ncols=math.ceil(len(g.legend_.texts)/12))\n",
"# g.set_title(cat)\n",
"# plt.show()"
2 years ago
]
2 years ago
}
],
"metadata": {
"kernelspec": {
2 years ago
"display_name": "Python 3 (ipykernel)",
2 years ago
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
2 years ago
"version": 3
2 years ago
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
2 years ago
"pygments_lexer": "ipython3",
"version": "3.9.16"
2 years ago
}
},
"nbformat": 4,
2 years ago
"nbformat_minor": 1
1 year ago
}