ZSI_Reconnect_China/WOS/wos_processing_pipeline.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import shutil\n",
    "from flashgeotext.geotext import GeoText\n",
    "import re"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "import hashlib\n",
    "\n",
    "def md5hash(s: str):\n",
    "    \"\"\"Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8.\"\"\"\n",
    "    hasher = hashlib.md5()\n",
    "    hasher.update(s.encode('utf-8'))\n",
    "    return hasher.hexdigest()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the column that holds the unique Web of Science record identifier.\n",
    "record_col=\"UT (Unique WOS ID)\"\n",
    "# Tab-separated WoS export loaded further down. The location can be overridden with the\n",
    "# WOS_OUTFILE environment variable so the notebook also runs on machines that do not\n",
    "# share this directory layout; without the variable the original path is used.\n",
    "outfile = os.environ.get(\n",
    "    \"WOS_OUTFILE\",\n",
    "    r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of initial (valid interval) records: 56196\n",
      "Number of METRIX filtered records: 49854\n",
      "Number of unindexed records: 2984\n",
      "Number of filtered records (dropping duplicates): 49839\n"
     ]
    }
   ],
   "source": [
    "# Load the concatenated WoS export (tab-separated file) into a DataFrame.\n",
    "wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
    "\n",
    "# Keep only records with a publication year strictly between 2010 and 2023.\n",
    "wos = wos[((wos[\"Publication Year\"]<2023)&(wos[\"Publication Year\"]>2010))].copy()\n",
    "print(f'Number of initial (valid interval) records: {len(wos)}')\n",
    "\n",
    "metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
    "\n",
    "# Reshape to one row per ISSN value. stack() leaves the stacked level unnamed, so\n",
    "# reset_index() labels it \"level_<number of index columns>\". The label is derived from\n",
    "# the id columns instead of being hard-coded, so the rename keeps working when the\n",
    "# number of columns in the input changes.\n",
    "metrix_id_cols = [c for c in metrix.columns if \"issn\" not in c]\n",
    "metrix = metrix.set_index(metrix_id_cols).stack().reset_index()\n",
    "metrix = metrix.rename(columns={f\"level_{len(metrix_id_cols)}\":\"issn_type\", 0:\"issn\"})\n",
    "metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
    "\n",
    "# Normalise both ISSN columns the same way as the metrix ISSNs, then reshape to one row\n",
    "# per (record, ISSN variant); issn_var records which of the two columns a value came from.\n",
    "wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
    "wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
    "wos_id_cols = [c for c in wos.columns if \"issn\" not in c]\n",
    "wos = wos.set_index(wos_id_cols).stack().reset_index()\n",
    "wos = wos.rename(columns={f\"level_{len(wos_id_cols)}\":\"issn_var\", 0:\"issn\"})\n",
    "\n",
    "# Left-join the classification on the normalised ISSN.\n",
    "wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
    "\n",
    "# A record is indexed when at least one of its ISSN rows received a Domain_English;\n",
    "# records with no such row at all are collected in wos_unindexed.\n",
    "wos_indexed = wos_merge[~wos_merge[\"Domain_English\"].isna()]\n",
    "wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n",
    "\n",
    "# Collapse back to one row per record; sorting issn_var in descending order decides\n",
    "# which ISSN row survives drop_duplicates.\n",
    "wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
    "wos = wos_indexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
    "\n",
    "wos_postmerge = wos.copy()\n",
    "print(f'Number of METRIX filtered records: {len(wos)}')\n",
    "print(f'Number of unindexed records: {len(wos_unindexed)}')\n",
    "\n",
    "# drop duplicates (based on doi)\n",
    "# NOTE(review): duplicated(False) means keep=False, so every row that shares a non-null\n",
    "# DOI with another row is removed (no copy is kept) - confirm this is intended.\n",
    "wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n",
    "# Of the records that agree on all bibliographic fields listed below, keep the first.\n",
    "wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
    "print(f'Number of filtered records (dropping duplicates): {len(wos)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "Domain_English\nApplied Sciences                31871\nNatural Sciences                 9542\nHealth Sciences                  5942\nEconomic & Social Sciences       1468\narticle-level classification      940\nArts & Humanities                  76\nName: count, dtype: int64"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of records per Domain_English value.\n",
    "wos[\"Domain_English\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "# Lookup table: for each (WoS Categories, Research Areas) pair among the classified\n",
    "# records, the modal Domain / Field / SubField (mode()[0], i.e. the first mode on ties).\n",
    "wos_classifier = (\n",
    "    wos[[\"WoS Categories\",\"Research Areas\"]+list(metrix.columns)]\n",
    "    .copy()\n",
    "    .drop_duplicates()\n",
    "    .groupby([\"WoS Categories\",\"Research Areas\"], as_index=False)[[\"Domain_English\",\"Field_English\",\"SubField_English\"]]\n",
    "    .agg(lambda col: col.mode()[0])\n",
    ")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found: 2065 \n",
      "Lost forever: 919\n"
     ]
    }
   ],
   "source": [
    "# Strip the metrix columns from the unclassified records, then look each record up in\n",
    "# wos_classifier by its exact (WoS Categories, Research Areas) pair.\n",
    "wos_to_reindex = wos_unindexed.drop(columns=metrix.columns.tolist())\n",
    "wos_found = pd.merge(\n",
    "    wos_to_reindex,\n",
    "    wos_classifier,\n",
    "    how=\"inner\",\n",
    "    on=[\"WoS Categories\",\"Research Areas\"],\n",
    ")\n",
    "# wos_found = wos_to_reindex.merge(wos_classifier, on=\"Research Areas\", how=\"inner\")\n",
    "# # wos_found = wos_to_reindex.merge(wos_classifier, on=\"WoS Categories\", how=\"inner\")\n",
    "# Records whose id does not appear in wos_found remain unclassified.\n",
    "wos_stillost = wos_unindexed.loc[~wos_unindexed[record_col].isin(wos_found[record_col])]\n",
    "\n",
    "print(\"Found:\", wos_found[record_col].nunique(),\"\\nLost forever:\", wos_stillost[record_col].nunique())"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of records (after remerge): 51904\n"
     ]
    }
   ],
   "source": [
    "# Append the re-classified records. De-duplicating on the record id keeps this cell\n",
    "# idempotent: without it, running the cell a second time would append wos_found again\n",
    "# and inflate every count computed afterwards.\n",
    "wos = pd.concat([wos,wos_found], ignore_index=True).drop_duplicates(subset=record_col, ignore_index=True)\n",
    "print(f'Number of records (after remerge): {len(wos)}')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "Domain_English\nApplied Sciences                33720\nNatural Sciences                 9617\nHealth Sciences                  6002\nEconomic & Social Sciences       1533\narticle-level classification      955\nArts & Humanities                  77\nName: count, dtype: int64"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of records per Domain_English value in the current wos frame.\n",
    "wos[\"Domain_English\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "data": {
      "text/plain": "WoS Categories\nEngineering, Electrical & Electronic         13661\nComputer Science, Artificial Intelligence     7760\nComputer Science, Information Systems         6481\nTelecommunications                            5560\nComputer Science, Theory & Methods            3597\n                                             ...  \nMusic                                            1\nCultural Studies                                 1\nPsychology, Psychoanalysis                       1\nAsian Studies                                    1\nAndrology                                        1\nName: count, Length: 236, dtype: int64"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per (record, WoS category): split the ';'-separated field, explode the\n",
    "# resulting lists and strip surrounding whitespace.\n",
    "wos_cat = (\n",
    "    wos.groupby(record_col)[\"WoS Categories\"]\n",
    "    .apply(lambda categories: categories.str.split(';'))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_1\")\n",
    ")\n",
    "wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
    "wos_cat[\"WoS Categories\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "data": {
      "text/plain": "WoS Category\nEngineering                                  20126\nComputer Science                             17613\nTelecommunications                            5560\nImaging Science & Photographic Technology     3295\nAutomation & Control Systems                  3232\n                                             ...  \nMusic                                            1\nAndrology                                        1\nLiterature                                       1\nCultural Studies                                 1\nAsian Studies                                    1\nName: count, Length: 177, dtype: int64"
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split each category at its first comma into a main category and a sub-category.\n",
    "wos_subcat = wos_cat.copy()\n",
    "category_parts = wos_subcat[\"WoS Categories\"].str.split(\",\", expand = True, n=1)\n",
    "wos_subcat[['WoS Category', 'WoS SubCategory']] = category_parts\n",
    "text_columns = ['WoS Category', 'WoS SubCategory',\"WoS Categories\"]\n",
    "wos_subcat[text_columns] = wos_subcat[text_columns].apply(lambda col: col.str.strip())\n",
    "# Count every main category at most once per record.\n",
    "wos_subcat.drop_duplicates(subset=[record_col,'WoS Category'])[\"WoS Category\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [
    {
     "data": {
      "text/plain": "Research Areas\nEngineering                                  20176\nComputer Science                             17613\nTelecommunications                            5560\nEnvironmental Sciences & Ecology              3732\nImaging Science & Photographic Technology     3295\n                                             ...  \nLiterature                                       1\nWomen's Studies                                  1\nCultural Studies                                 1\nAsian Studies                                    1\nMusic                                            1\nName: count, Length: 147, dtype: int64"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per (record, research area): split the ';'-separated field, explode the\n",
    "# resulting lists and strip surrounding whitespace.\n",
    "wos_areas = (\n",
    "    wos.groupby(record_col)[\"Research Areas\"]\n",
    "    .apply(lambda areas: areas.str.split(';'))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_1\")\n",
    ")\n",
    "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
    "wos_areas[\"Research Areas\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [
    {
     "data": {
      "text/plain": "                                           Article Title   \n24862  Kinematic self-calibration of non-contact five...  \\\n6623   Optimizing Color Assignment for Perception of ...   \n20728  CFD modeling of biomass combustion and gasific...   \n41245         Redshift-space distortions in f(R) gravity   \n12373  Executable Knowledge Graphs for Machine Learni...   \n...                                                  ...   \n11117  Biochar amendment mitigated N2O emissions from...   \n47975  Adaptive Noise Reduction for Sound Event Detec...   \n4599   NVM Storage in IoT Devices: Opportunities and ...   \n40609  FABNet: Fusion Attention Block and Transfer Le...   \n45199  Tea Category Identification Using a Novel Frac...   \n\n                                           Keywords Plus   \n24862            POSE MEASUREMENT; PARALLEL; MANIPULATOR  \\\n6623                            OPTIMIZATION; DIFFERENCE   \n20728  DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...   \n41245  DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...   \n12373                                                NaN   \n...                                                  ...   \n11117  NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...   \n47975  NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...   \n4599   ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...   \n40609                                             NUCLEI   \n45199  LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...   \n\n                                         Author Keywords  \n24862  kinematic self-calibration; five-axis measurin...  \n6623       Color perception; visual design; scatterplots  \n20728  Biomass combustion and gasification; CFD simul...  \n41245  cosmology: theory; dark energy; large-scale st...  \n12373  Knowledge graph; Machine learning; Data analyt...  \n...                                                  ...  \n11117  Biochar; Nitrite accumulation; Nitrous oxide; ...  
\n47975  sound event detection; non-stationary noise; w...  \n4599   IoT; NVM; storage system; energy efficiency; s...  \n40609  Cancer; Analytical models; Transfer learning; ...  \n45199  tea-category identification; fractional Fourie...  \n\n[100 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Article Title</th>\n      <th>Keywords Plus</th>\n      <th>Author Keywords</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>24862</th>\n      <td>Kinematic self-calibration of non-contact five...</td>\n      <td>POSE MEASUREMENT; PARALLEL; MANIPULATOR</td>\n      <td>kinematic self-calibration; five-axis measurin...</td>\n    </tr>\n    <tr>\n      <th>6623</th>\n      <td>Optimizing Color Assignment for Perception of ...</td>\n      <td>OPTIMIZATION; DIFFERENCE</td>\n      <td>Color perception; visual design; scatterplots</td>\n    </tr>\n    <tr>\n      <th>20728</th>\n      <td>CFD modeling of biomass combustion and gasific...</td>\n      <td>DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...</td>\n      <td>Biomass combustion and gasification; CFD simul...</td>\n    </tr>\n    <tr>\n      <th>41245</th>\n      <td>Redshift-space distortions in f(R) gravity</td>\n      <td>DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...</td>\n      <td>cosmology: theory; dark energy; large-scale st...</td>\n    </tr>\n    <tr>\n      <th>12373</th>\n      <td>Executable Knowledge Graphs for Machine Learni...</td>\n      <td>NaN</td>\n      <td>Knowledge graph; Machine learning; Data analyt...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>11117</th>\n      <td>Biochar amendment mitigated N2O emissions from...</td>\n      <td>NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...</td>\n      <td>Biochar; Nitrite accumulation; Nitrous oxide; ...</td>\n    </tr>\n    <tr>\n      <th>47975</th>\n      
<td>Adaptive Noise Reduction for Sound Event Detec...</td>\n      <td>NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...</td>\n      <td>sound event detection; non-stationary noise; w...</td>\n    </tr>\n    <tr>\n      <th>4599</th>\n      <td>NVM Storage in IoT Devices: Opportunities and ...</td>\n      <td>ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...</td>\n      <td>IoT; NVM; storage system; energy efficiency; s...</td>\n    </tr>\n    <tr>\n      <th>40609</th>\n      <td>FABNet: Fusion Attention Block and Transfer Le...</td>\n      <td>NUCLEI</td>\n      <td>Cancer; Analytical models; Transfer learning; ...</td>\n    </tr>\n    <tr>\n      <th>45199</th>\n      <td>Tea Category Identification Using a Novel Frac...</td>\n      <td>LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...</td>\n      <td>tea-category identification; fractional Fourie...</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Eyeball 100 random title / keyword rows; the fixed random_state makes a re-run of the\n",
    "# notebook display the same sample.\n",
    "wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100, random_state=42)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "data": {
      "text/plain": "      UT (Unique WOS ID)                   keyword_all\n0    WOS:000208837000001               NANOINDENTATION\n1    WOS:000208837000001                      HARDNESS\n2    WOS:000208837000001        PLASMA-SPRAYED COATING\n3    WOS:000208837000001              INVERSE ANALYSIS\n4    WOS:000208837000001              NUMERICAL METHOD\n..                   ...                           ...\n97   WOS:000209571700012         PERSONALIZED MEDICINE\n98   WOS:000209571700012               COMPLEX NETWORK\n99   WOS:000209571700012    CLINICAL PHENOTYPE NETWORK\n100  WOS:000209571700012  TRADITIONAL CHINESE MEDICINE\n101  WOS:000209617200002                PHYLLOSCOPIDAE\n\n[100 rows x 2 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>keyword_all</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>NANOINDENTATION</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>HARDNESS</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>PLASMA-SPRAYED COATING</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>INVERSE ANALYSIS</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208837000001</td>\n      <td>NUMERICAL METHOD</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>97</th>\n      <td>WOS:000209571700012</td>\n      <td>PERSONALIZED MEDICINE</td>\n    </tr>\n    <tr>\n      <th>98</th>\n      <td>WOS:000209571700012</td>\n      <td>COMPLEX NETWORK</td>\n    </tr>\n    <tr>\n      <th>99</th>\n      <td>WOS:000209571700012</td>\n      <td>CLINICAL PHENOTYPE NETWORK</td>\n    </tr>\n    <tr>\n      <th>100</th>\n      <td>WOS:000209571700012</td>\n      <td>TRADITIONAL CHINESE MEDICINE</td>\n    </tr>\n    <tr>\n      <th>101</th>\n      <td>WOS:000209617200002</td>\n      <td>PHYLLOSCOPIDAE</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the keywords of both keyword fields, one row per (record, keyword),\n",
    "# upper-cased and stripped.\n",
    "keyword_frames = []\n",
    "for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
    "    kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
    "    kwp.name = 'keyword_all'\n",
    "    # Prepend, so the rows of the field processed last come first in kw_df.\n",
    "    keyword_frames.insert(0, kwp.reset_index())\n",
    "# Concatenate once instead of repeatedly growing a frame that starts out empty.\n",
    "kw_df = pd.concat(keyword_frames, ignore_index=True)\n",
    "kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\")\n",
    "# Remove parenthesised / bracketed parts. The pattern is a raw string because\n",
    "# backslash-parenthesis is not a valid escape sequence in a normal string literal.\n",
    "kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].str.replace(r\"[\\(\\[].*?[\\)\\]]\", \"\", regex=True).str.strip()\n",
    "# Removing bracketed text can leave empty or repeated keywords, so filter and\n",
    "# de-duplicate only after the clean-up.\n",
    "kw_df = kw_df[kw_df[\"keyword_all\"]!=\"\"].drop_duplicates()\n",
    "kw_df.head(100)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "data": {
      "text/plain": "    UT (Unique WOS ID)                                        keyword_all\n0  WOS:000208837000001  NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...\n1  WOS:000208863600013  COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n2  WOS:000208863600266  ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n3  WOS:000208863900217  DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n4  WOS:000208935500007  ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>keyword_all</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208863600013</td>\n      <td>COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208863600266</td>\n      <td>ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208863900217</td>\n      <td>DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208935500007</td>\n      <td>ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per record: all of its keywords joined into a single '; '-separated string.\n",
    "wos_kwd_concat = (\n",
    "    kw_df\n",
    "    .groupby(record_col, as_index=False)\n",
    "    .agg(keyword_all=(\"keyword_all\", '; '.join))\n",
    ")\n",
    "wos_kwd_concat.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [
    {
     "data": {
      "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n       'Conference Date', 'Conference Location', 'Conference Sponsor',\n       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n       'Funding Text', 'Cited References', 'Cited Reference Count',\n       'Times Cited, WoS Core', 'Times Cited, All Databases',\n       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n       'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n       'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n       'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n       'Number of Pages', 'WoS Categories', 'Web of Science Index',\n       'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n       'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n       'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n       'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n       'srcid', 'issn_type'],\n      dtype='object')"
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the column labels currently present in wos.\n",
    "wos.columns"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "geotext = GeoText()\n",
    "\n",
    "# Text fragments looked up when GeoText reports no country, mapped to the country name\n",
    "# to return. Fragments are tried in the order listed and the first one contained in the\n",
    "# text wins. Built once here instead of on every extract_location() call.\n",
    "_COUNTRY_ANOMALIES = {\"Malta\":\"Malta\",\n",
    "                      \"Mongolia\":\"Mongolia\",\n",
    "                      \"Quatar\":\"Qatar\",\n",
    "                      \"Qatar\":\"Qatar\",\n",
    "                      \"Ethiop\":\"Ethiopia\",\n",
    "                      \"Nigeria\":\"Nigeria\",\n",
    "                      \"BELAR\":\"Belarus\",\n",
    "                      \"Venezuela\":\"Venezuela\",\n",
    "                      \"Cyprus\":\"Cyprus\",\n",
    "                      \"Ecuador\":\"Ecuador\",\n",
    "                      \"U Arab\":\"United Arab Emirates\",\n",
    "                      \"Syria\":\"Syria\",\n",
    "                      \"Uganda\":\"Uganda\",\n",
    "                      \"Yemen\":\"Yemen\",\n",
    "                      \"Mali\":\"Mali\",\n",
    "                      \"Senegal\":\"Senegal\",\n",
    "                      \"Vatican\":\"Vatican\",\n",
    "                      \"Uruguay\":\"Uruguay\",\n",
    "                      \"Panama\":\"Panama\",\n",
    "                      \"Fiji\":\"Fiji\",\n",
    "                      \"Faroe\":\"Faroe Islands\",\n",
    "                      \"Macedonia\":\"Macedonia\",\n",
    "                      'Mozambique':'Mozambique',\n",
    "                      \"Kuwait\":\"Kuwait\",\n",
    "                      \"Libya\":\"Libya\",\n",
    "                      \"Turkiy\":\"Turkey\",\n",
    "                      \"Liberia\":\"Liberia\",\n",
    "                      \"Namibia\":\"Namibia\",\n",
    "                      \"Ivoire\":\"Ivory Coast\",\n",
    "                      \"Guatemala\":\"Guatemala\",\n",
    "                      \"Paraguay\":\"Paraguay\",\n",
    "                      \"Honduras\":\"Honduras\",\n",
    "                      \"Nicaragua\":\"Nicaragua\",\n",
    "                      \"Trinidad\":\"Trinidad & Tobago\",\n",
    "                      \"Liechtenstein\":\"Liechtenstein\",\n",
    "                      \"Greenland\":\"Denmark\"}\n",
    "\n",
    "# Names reported as 'United Kingdom' when they occur in the text.\n",
    "_UK_NATIONS = ('Scotland','Wales','England', 'N Ireland', 'Northern Ireland')\n",
    "\n",
    "def extract_location(input_text, key='countries'):\n",
    "    \"\"\"Return one location name found in ``input_text``, or ``None``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    input_text : str\n",
    "        Text to search, e.g. one address string.\n",
    "    key : str, default 'countries'\n",
    "        Entry of the ``geotext.extract`` result to read; this notebook also uses 'cities'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str or None\n",
    "        The first name, in sorted order, that GeoText reports under ``key``. When\n",
    "        GeoText reports nothing and ``key`` is 'countries', falls back to plain\n",
    "        substring checks: a UK nation name yields 'United Kingdom', otherwise the first\n",
    "        matching fragment of ``_COUNTRY_ANOMALIES`` decides. ``None`` if nothing matches.\n",
    "    \"\"\"\n",
    "    extracted = geotext.extract(input_text=input_text)\n",
    "    found = sorted(extracted[key].keys())\n",
    "    if found:\n",
    "        return found[0]\n",
    "    if key != 'countries':\n",
    "        return None\n",
    "    if any(nation in input_text for nation in _UK_NATIONS):\n",
    "        return 'United Kingdom'\n",
    "    for fragment, country in _COUNTRY_ANOMALIES.items():\n",
    "        if fragment in input_text:\n",
    "            return country\n",
    "    return None\n",
    "\n",
    "# eu_countries: the comma-separated entries on the first line of ../eu_members.txt,\n",
    "# each stripped of surrounding whitespace.\n",
    "with open('../eu_members.txt',\"r\") as f:\n",
    "    eu_countries=[entry.strip() for entry in f.readline().split(\",\")]\n",
    "\n",
    "def country_cleanup(country):\n",
    "    \"\"\"Map country-name variants onto a single label.\n",
    "\n",
    "    Any string containing \"USA\" becomes \"USA\" and, failing that, any string containing\n",
    "    \"China\" becomes \"China\". The UK nation names listed below become \"United Kingdom\".\n",
    "    Every other value is returned unchanged.\n",
    "    \"\"\"\n",
    "    for marker in (\"USA\", \"China\"):\n",
    "        if marker in country:\n",
    "            return marker\n",
    "    uk_nations = (\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\")\n",
    "    if country in uk_nations:\n",
    "        return \"United Kingdom\"\n",
    "    return country\n",
    "\n",
    "\n",
    "def country_type(country):\n",
    "    \"\"\"Classify a country name as \"EU\", \"China\", \"Non-EU associate\" or \"Other\".\n",
    "\n",
    "    Membership in the module-level ``eu_countries`` list is tested first, then the\n",
    "    literal \"China\", then the three associate names below.\n",
    "    \"\"\"\n",
    "    if country in eu_countries:\n",
    "        return \"EU\"\n",
    "    if country==\"China\":\n",
    "        return \"China\"\n",
    "    is_associate = country in (\"Switzerland\", 'Norway','United Kingdom')\n",
    "    return \"Non-EU associate\" if is_associate else \"Other\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the raw Addresses field on '[' so that every chunk gets its own row.\n",
    "locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
    "\n",
    "# Drop empty chunks and missing addresses (NaN). A NaN would otherwise crash the\n",
    "# string handling on these rows here and in later cells.\n",
    "locations = locations[locations[\"Addresses\"].notna() & (locations[\"Addresses\"]!=\"\")].copy()\n",
    "# Within a chunk, the text before the first ']' is taken as the author list and the\n",
    "# text after the last ']' as the address.\n",
    "chunk_parts = locations[\"Addresses\"].str.split(\"]\")\n",
    "locations[\"Address\"] = chunk_parts.str[-1]\n",
    "locations[\"Authors_of_address\"] = chunk_parts.str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "data": {
      "text/plain": "312820"
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows in locations at this point.\n",
    "len(locations)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [
    {
     "data": {
      "text/plain": "     UT (Unique WOS ID)                                 Authors_of_address   \n0   WOS:000208837000001                                Gitzhofer, Francois  \\\n1   WOS:000208837000001  Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...   \n2   WOS:000208837000001                     Guo, Wei-Chao; Zhang, Wei-Hong   \n3   WOS:000208837000001                                       Rauchs, Gast   \n4   WOS:000208863600013                                         Hu, Baolan   \n..                  ...                                                ...   \n95  WOS:000209546000001                                  Salahuddin, Nawal   \n96  WOS:000209546000001                                Shrestha, Babu Raja   \n97  WOS:000209546000001                                   Tan, Cheng Cheng   \n98  WOS:000209546000001                                     Tang, Yao-Qing   \n99  WOS:000209546000001                                       Tu, Mei-Lien   \n\n                                              Address  \n0   Univ Sherbrooke, Dept Chem Engn, Plasma Techno...  \n1   Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...  \n2   Northwestern Polytech Univ, Key Lab Contempora...  \n3   Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru...  \n4   Zhejiang Univ, Dept Environm Engn, Hangzhou 31...  \n..                                                ...  \n95  Aga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca...  \n96  Kathmandu Med Coll Teaching Hosp, Dept Anesthe...  \n97  Sultanah Aminah Hosp, Dept Anaesthesia & Inten...  \n98  Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...  \n99  Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R...  \n\n[100 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Authors_of_address</th>\n      <th>Address</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>Gitzhofer, Francois</td>\n      <td>Univ Sherbrooke, Dept Chem Engn, Plasma Techno...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...</td>\n      <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>Guo, Wei-Chao; Zhang, Wei-Hong</td>\n      <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>Rauchs, Gast</td>\n      <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208863600013</td>\n      <td>Hu, Baolan</td>\n      <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>95</th>\n      <td>WOS:000209546000001</td>\n      <td>Salahuddin, Nawal</td>\n      <td>Aga Khan Univ &amp; Hosp, Dept Med, Pulm &amp; Crit Ca...</td>\n    </tr>\n    <tr>\n      <th>96</th>\n      <td>WOS:000209546000001</td>\n      <td>Shrestha, Babu Raja</td>\n      <td>Kathmandu Med Coll Teaching Hosp, Dept Anesthe...</td>\n    </tr>\n    <tr>\n      <th>97</th>\n      <td>WOS:000209546000001</td>\n      <td>Tan, Cheng Cheng</td>\n      
<td>Sultanah Aminah Hosp, Dept Anaesthesia &amp; Inten...</td>\n    </tr>\n    <tr>\n      <th>98</th>\n      <td>WOS:000209546000001</td>\n      <td>Tang, Yao-Qing</td>\n      <td>Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...</td>\n    </tr>\n    <tr>\n      <th>99</th>\n      <td>WOS:000209546000001</td>\n      <td>Tu, Mei-Lien</td>\n      <td>Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R...</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Trim outer whitespace and stray edge ';' so the split below produces no empty edge pieces.\n",
    "locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n",
    "# One row per individual address: split the ';'-separated field within each\n",
    "# (record, author group) and explode the resulting lists.\n",
    "locations = (\n",
    "    locations\n",
    "    .groupby([record_col, \"Authors_of_address\"])[\"Address\"]\n",
    "    .apply(lambda x: x.str.split(\";\"))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_2\")\n",
    ")\n",
    "# Pieces after the first may keep the blank that followed their ';' separator; strip it so\n",
    "# identical addresses compare equal and the first comma token carries no leading space.\n",
    "locations[\"Address\"] = locations[\"Address\"].str.strip()\n",
    "locations.head(100)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [],
   "source": [
    "# import dask.dataframe as dd\n",
    "#\n",
    "# locations_ddf = dd.from_pandas(locations, npartitions=4)  # convert pandas DataFrame to Dask DataFrame\n",
    "# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute()  # compute the result"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [],
   "source": [
    "# locations_test = locations.head(1000)\n",
    "# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n",
    "# locations_test"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [
    "\n",
    "# Disabled alternative that looked the country up with extract_location:\n",
    "# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
    "\n",
    "# Country = text after the last comma of the address, minus surrounding blanks and ';',\n",
    "# then passed through country_cleanup.\n",
    "locations[\"Country\"] = (\n",
    "    locations[\"Address\"]\n",
    "    .apply(lambda addr: addr.rsplit(\",\", 1)[-1].strip(\" \").strip(\";\").strip(\" \"))\n",
    "    .apply(country_cleanup)\n",
    ")\n",
    "locations[\"City\"] = locations[\"Address\"].apply(lambda addr: extract_location(input_text=addr, key=\"cities\"))\n",
    "locations[\"Country_Type\"] = locations[\"Country\"].apply(country_type)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [],
   "source": [
    "# Keep only the address rows whose country type is one of the types under study.\n",
    "scope_types = [\"EU\", \"China\", \"Non-EU associate\"]\n",
    "locations = locations.loc[locations[\"Country_Type\"].isin(scope_types)]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "    UT (Unique WOS ID)                                            Address   \n1  WOS:000208837000001  Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...  \\\n2  WOS:000208837000001  Northwestern Polytech Univ, Key Lab Contempora...   \n3  WOS:000208837000001  Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru...   \n4  WOS:000208863600013  Zhejiang Univ, Dept Environm Engn, Hangzhou 31...   \n5  WOS:000208863600013  Delft Univ Technol, Dept Biotechnol, Delft, Ne...   \n\n       Country        City Country_Type                 Institution  \n1      Belgium       Liège           EU                  Univ Liege  \n2        China       Xi’an        China  Northwestern Polytech Univ  \n3   Luxembourg  Luxembourg           EU   Ctr Rech Publ Henri Tudor  \n4        China    Hangzhou        China               Zhejiang Univ  \n5  Netherlands       Delft           EU          Delft Univ Technol  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Address</th>\n      <th>Country</th>\n      <th>City</th>\n      <th>Country_Type</th>\n      <th>Institution</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n      <td>Belgium</td>\n      <td>Liège</td>\n      <td>EU</td>\n      <td>Univ Liege</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n      <td>China</td>\n      <td>Xi’an</td>\n      <td>China</td>\n      <td>Northwestern Polytech Univ</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n      <td>Luxembourg</td>\n      <td>Luxembourg</td>\n      <td>EU</td>\n      <td>Ctr Rech Publ Henri Tudor</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208863600013</td>\n      <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n      <td>China</td>\n      <td>Hangzhou</td>\n      <td>China</td>\n      <td>Zhejiang Univ</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>WOS:000208863600013</td>\n      <td>Delft Univ Technol, Dept Biotechnol, Delft, Ne...</td>\n      <td>Netherlands</td>\n      <td>Delft</td>\n      <td>EU</td>\n      <td>Delft Univ Technol</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Institution-level view: distinct (record, address) rows with their derived location fields.\n",
    "univ_locations = locations[[record_col, \"Address\", \"Country\", \"City\", \"Country_Type\"]].copy()\n",
    "# The institution is the first comma-separated token of the address. Strip it: an address\n",
    "# that followed a ';' separator can start with a blank, which would otherwise turn one\n",
    "# institution into two labels (\"Univ X\" vs \" Univ X\") in later counts and merges.\n",
    "univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0].strip())\n",
    "univ_locations = univ_locations.drop_duplicates()\n",
    "univ_locations.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "    UT (Unique WOS ID)  Country Country_Type                     author_str_id\n0  WOS:000208837000001  Belgium           EU  6079964a4094c607358a130e41e89f90\n1  WOS:000208837000001  Belgium           EU  2321037fa90ac94a23b88a79f1c7f454\n2  WOS:000208837000001  Belgium           EU  8a1bfa1e7bc52d323f0d9c23a9b74ed3\n3  WOS:000208837000001    China        China  6079964a4094c607358a130e41e89f90\n4  WOS:000208837000001    China        China  17fb036de6a4db3ba39ccab3d8307c04",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Country</th>\n      <th>Country_Type</th>\n      <th>author_str_id</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>2321037fa90ac94a23b88a79f1c7f454</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>8a1bfa1e7bc52d323f0d9c23a9b74ed3</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per (record, country, country type, author): split each ';'-separated author group.\n",
    "author_locations = (\n",
    "    locations\n",
    "    .groupby([record_col, \"Country\", \"Country_Type\"])[\"Authors_of_address\"]\n",
    "    .apply(lambda names: names.str.split(\";\"))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_3\")\n",
    ")\n",
    "# Author key: md5 of the trimmed, lower-cased name reduced to its alphanumeric characters.\n",
    "# The readable name itself is not kept in the resulting frame.\n",
    "author_locations = (\n",
    "    author_locations\n",
    "    .assign(\n",
    "        author_str_id=author_locations[\"Authors_of_address\"].str.strip().apply(\n",
    "            lambda name: md5hash(\"\".join(ch for ch in name.lower() if ch.isalnum()))\n",
    "        )\n",
    "    )\n",
    "    .drop(columns=\"Authors_of_address\")\n",
    ")\n",
    "author_locations.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [
    {
     "data": {
      "text/plain": "         UT (Unique WOS ID)      Country      Country_Type   \n0       WOS:000208837000001      Belgium                EU  \\\n3       WOS:000208837000001        China             China   \n4       WOS:000208837000001        China             China   \n6       WOS:000208863600013        China             China   \n7       WOS:000208863600013  Netherlands                EU   \n...                     ...          ...               ...   \n643323  WOS:000964683900016        Italy                EU   \n643324  WOS:000964683900016        Italy                EU   \n643325  WOS:000967389100001        China             China   \n643326  WOS:000967389100001       Norway  Non-EU associate   \n643327  WOS:000967389100001       Norway  Non-EU associate   \n\n                           author_str_id  \n0       6079964a4094c607358a130e41e89f90  \n3       6079964a4094c607358a130e41e89f90  \n4       17fb036de6a4db3ba39ccab3d8307c04  \n6       54c7bc6fe9b77434ca1bf04d763d843b  \n7       df81f9da6c8f5c968c16ef0aab1bb8f9  \n...                                  ...  \n643323  3c631398a81ab7058d95a0c6418a2c0b  \n643324  3c631398a81ab7058d95a0c6418a2c0b  \n643325  ce65541a6c334225a9617439f4a95012  \n643326  7c52a53f8d79b1ffd4f2e4cde9548e1d  \n643327  7c52a53f8d79b1ffd4f2e4cde9548e1d  \n\n[573569 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Country</th>\n      <th>Country_Type</th>\n      <th>author_str_id</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>WOS:000208863600013</td>\n      <td>China</td>\n      <td>China</td>\n      <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>WOS:000208863600013</td>\n      <td>Netherlands</td>\n      <td>EU</td>\n      <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>643323</th>\n      <td>WOS:000964683900016</td>\n      <td>Italy</td>\n      <td>EU</td>\n      <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n    </tr>\n    <tr>\n      <th>643324</th>\n      <td>WOS:000964683900016</td>\n      <td>Italy</td>\n      <td>EU</td>\n      <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n    </tr>\n    <tr>\n      <th>643325</th>\n      <td>WOS:000967389100001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>ce65541a6c334225a9617439f4a95012</td>\n    
</tr>\n    <tr>\n      <th>643326</th>\n      <td>WOS:000967389100001</td>\n      <td>Norway</td>\n      <td>Non-EU associate</td>\n      <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n    </tr>\n    <tr>\n      <th>643327</th>\n      <td>WOS:000967389100001</td>\n      <td>Norway</td>\n      <td>Non-EU associate</td>\n      <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n    </tr>\n  </tbody>\n</table>\n<p>573569 rows × 4 columns</p>\n</div>"
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Every row whose author key occurs more than once (all occurrences are shown).\n",
    "author_locations.loc[author_locations[\"author_str_id\"].duplicated(keep=False)]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse to one row per (record, author). Sorting by Country_Type first means the\n",
    "# alphabetically smallest type is the one kept for an author with several affiliations.\n",
    "author_primary_region = (\n",
    "    author_locations\n",
    "    .sort_values(by=\"Country_Type\")\n",
    "    .drop_duplicates(subset=[record_col, \"author_str_id\"])\n",
    ")\n",
    "\n",
    "# Record ids having at least one author whose retained country type matches.\n",
    "primary_type = author_primary_region[\"Country_Type\"]\n",
    "china = author_primary_region.loc[primary_type == \"China\", record_col].unique()\n",
    "eu = author_primary_region.loc[primary_type == \"EU\", record_col].unique()\n",
    "assoc = author_primary_region.loc[primary_type == \"Non-EU associate\", record_col].unique()\n",
    "\n",
    "# records that have distinct authors with different country affiliations:\n",
    "# a China author together with an EU or a Non-EU associate author\n",
    "record_ids = wos[record_col]\n",
    "valid_scope = wos.loc[\n",
    "    record_ids.isin(china) & (record_ids.isin(eu) | record_ids.isin(assoc)),\n",
    "    record_col,\n",
    "].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "outputs": [
    {
     "data": {
      "text/plain": "         UT (Unique WOS ID) Country Country_Type   \n537692  WOS:000732204600001   China        China  \\\n204027  WOS:000414089800001   China        China   \n204028  WOS:000414089800001   China        China   \n204029  WOS:000414089800001   China        China   \n204030  WOS:000414090800001   China        China   \n\n                           author_str_id  \n537692  8fe31cbbd07c639aa4d779688896be81  \n204027  67c7beb18fafd77f1319739fa683bc5e  \n204028  7269f0a31fc620688aae12aad9e3cd85  \n204029  ac28aea698a527fb5195d3d24189ea04  \n204030  6c91bf481b6bddc1426d12a18823224a  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Country</th>\n      <th>Country_Type</th>\n      <th>author_str_id</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>537692</th>\n      <td>WOS:000732204600001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>8fe31cbbd07c639aa4d779688896be81</td>\n    </tr>\n    <tr>\n      <th>204027</th>\n      <td>WOS:000414089800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>67c7beb18fafd77f1319739fa683bc5e</td>\n    </tr>\n    <tr>\n      <th>204028</th>\n      <td>WOS:000414089800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>7269f0a31fc620688aae12aad9e3cd85</td>\n    </tr>\n    <tr>\n      <th>204029</th>\n      <td>WOS:000414089800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>ac28aea698a527fb5195d3d24189ea04</td>\n    </tr>\n    <tr>\n      <th>204030</th>\n      <td>WOS:000414090800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>6c91bf481b6bddc1426d12a18823224a</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "author_primary_region.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of records: 51904\n",
      "Number of valid cooperation records: 46060\n"
     ]
    }
   ],
   "source": [
    "print(f'Number of records: {len(wos)}')\n",
    "print(f'Number of valid cooperation records: {len(valid_scope)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "outputs": [],
   "source": [
    "# Restrict every working table to the records that passed the cooperation filter.\n",
    "wos = wos[wos[record_col].isin(valid_scope)]\n",
    "locations = locations[locations[record_col].isin(valid_scope)]\n",
    "univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
    "author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
    "# Filter the de-duplicated per-author table itself. It used to be rebuilt from\n",
    "# author_locations, which silently replaced it with the full multi-affiliation table.\n",
    "author_primary_region = author_primary_region[author_primary_region[record_col].isin(valid_scope)]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per distinct (record, affiliation): split the ';'-separated field, normalise\n",
    "# each entry to trimmed upper case and label missing values as UNKNOWN.\n",
    "affiliations = (\n",
    "    wos\n",
    "    .groupby(record_col)[\"Affiliations\"]\n",
    "    .apply(lambda col: col.str.split(\";\"))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_1\")\n",
    "    .assign(Affiliations=lambda df: df[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\"))\n",
    "    .drop_duplicates()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "outputs": [
    {
     "data": {
      "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES                                                       5616\nUNIVERSITY OF LONDON                                                              2604\nUDICE-FRENCH RESEARCH UNIVERSITIES                                                2240\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS)                               2170\nTSINGHUA UNIVERSITY                                                               1935\n                                                                                  ... \nUNIVERSITY OF FUKUI                                                                  1\nPONTIFICIA UNIVERSIDADE CATOLICA DE GOIAS                                            1\nINSTITUTE OF ORGANIC CHEMISTRY & BIOCHEMISTRY OF THE CZECH ACADEMY OF SCIENCES       1\nUNIVERSITAS PELITA HARAPAN                                                           1\nFRANCISCUS GASTHUIS                                                                  1\nName: count, Length: 7609, dtype: int64"
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "affiliations[\"Affiliations\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "outputs": [
    {
     "data": {
      "text/plain": "Institution\nChinese Acad Sci                                 5749\nTsinghua Univ                                    2315\nShanghai Jiao Tong Univ                          1976\nZhejiang Univ                                    1806\nPeking Univ                                      1661\n                                                 ... \nNatl Technol Inst Mental Disorders                  1\nSeinajoki Univ Appl Sci                             1\nJD Intelligent City Res                             1\nCAS Ctr Excellence Planetol                         1\nKey Lab Intelligent Prevent Med Zhejiang Prov       1\nName: count, Length: 19821, dtype: int64"
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "univ_locations[\"Institution\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "outputs": [
    {
     "data": {
      "text/plain": "46060"
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "univ_locations[record_col].nunique()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "outputs": [
    {
     "data": {
      "text/plain": "46060"
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "affiliations[record_col].nunique()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "outputs": [
    {
     "data": {
      "text/plain": "202790"
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "univ_locations[\"Institution\"].value_counts().sum()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "outputs": [
    {
     "data": {
      "text/plain": "268471"
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "affiliations[\"Affiliations\"].value_counts().sum()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "WoS Categories\n Engineering, Electrical & Electronic        8303\nComputer Science, Artificial Intelligence    6115\n Telecommunications                          4661\nComputer Science, Information Systems        4584\nEngineering, Electrical & Electronic         4036\n                                             ... \nCultural Studies                                1\n Ornithology                                    1\n Criminology & Penology                         1\nArt                                             1\n Psychology, Developmental                      1\nName: count, Length: 425, dtype: int64"
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per (record, WoS category): split the ';'-separated field and explode it.\n",
    "wos_cat = (\n",
    "    wos\n",
    "    .groupby(record_col)[\"WoS Categories\"]\n",
    "    .apply(lambda x: x.str.split(\";\"))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_1\")\n",
    ")\n",
    "# Strip the blank left behind by the ';' separator; without it the same category is\n",
    "# counted twice, once with and once without a leading space.\n",
    "wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
    "wos_cat[\"WoS Categories\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "Research Areas\nEngineering                                  18098\nComputer Science                             15658\nTelecommunications                            5046\nEnvironmental Sciences & Ecology              3246\nImaging Science & Photographic Technology     2947\n                                             ...  \nFilm, Radio & Television                         2\nArea Studies                                     2\nCultural Studies                                 1\nAsian Studies                                    1\nMusic                                            1\nName: count, Length: 145, dtype: int64"
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per (record, research area), with surrounding whitespace removed from each area.\n",
    "wos_areas = (\n",
    "    wos\n",
    "    .groupby(record_col)[\"Research Areas\"]\n",
    "    .apply(lambda col: col.str.split(\";\"))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_1\")\n",
    ")\n",
    "wos_areas = wos_areas.assign(**{\"Research Areas\": wos_areas[\"Research Areas\"].str.strip()})\n",
    "wos_areas[\"Research Areas\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[c for c in wos.columns if \"_English\" in c]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns whose name contains '_English'; in each of them the placeholder value\n",
    "# 'article-level classification' is relabelled as 'Multidisciplinary'.\n",
    "metrix_levels = [col for col in wos.columns if \"_English\" in col]\n",
    "relabel = {\"article-level classification\": \"Multidisciplinary\"}\n",
    "for m in metrix_levels:\n",
    "    wos[m] = wos[m].replace(relabel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "      Publication Type                                            Authors   \n0                    J                       Yan, Z; Jing, XY; Pedrycz, W  \\\n1                    J  Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...   \n2                    J  Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...   \n3                    J  Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...   \n4                    J  Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...   \n...                ...                                                ...   \n51897                J  Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...   \n51898                J                     Wang, HC; Roussel, P; Denby, B   \n51899                J       Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S   \n51902                J                                   Chu, WP; Song, Y   \n51903                J  Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;...   \n\n      Book Authors Book Editors Book Group Authors   \n0              NaN          NaN                NaN  \\\n1              NaN          NaN                NaN   \n2              NaN          NaN                NaN   \n3              NaN          NaN                NaN   \n4              NaN          NaN                NaN   \n...            ...          ...                ...   \n51897          NaN          NaN                NaN   \n51898          NaN          NaN                NaN   \n51899          NaN          NaN                NaN   \n51902          NaN          NaN                NaN   \n51903          NaN          NaN                NaN   \n\n                                       Author Full Names   \n0              Yan, Zheng; Jing, Xuyang; Pedrycz, Witold  \\\n1      Sookhak, Mehdi; Yu, F. Richard; He, Ying; Tale...   \n2      Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...   \n3      Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...   \n4      Lu, Tianguang; Chen, Xinyu; McElroy, Michael B...   \n...       
                                           ...   \n51897  Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...   \n51898       Wang, Hongcui; Roussel, Pierre; Denby, Bruce   \n51899  Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...   \n51902                           Chu, Wenping; Song, Yang   \n51903  Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa...   \n\n      Book Author Full Names Group Authors   \n0                        NaN           NaN  \\\n1                        NaN           NaN   \n2                        NaN           NaN   \n3                        NaN           NaN   \n4                        NaN           NaN   \n...                      ...           ...   \n51897                    NaN           NaN   \n51898                    NaN           NaN   \n51899                    NaN           NaN   \n51902                    NaN           NaN   \n51903                    NaN           NaN   \n\n                                           Article Title   \n0      LEFusing and mining opinions for reputation ge...  \\\n1      FOG VEHICULAR COMPUTING Augmentation of Fog Co...   \n2      Deep Reinforcement Learning for Intelligent In...   \n3      An Intelligent UAV based Data Aggregation Algo...   \n4      A Reinforcement Learning-Based Decision System...   \n...                                                  ...   \n51897  Neural modal ordinary differential equations: ...   \n51898  Improving ultrasound-based multimodal speech r...   \n51899  Application of computer-aided image reconstruc...   \n51902  Study on Dynamic Interaction of Railway Pantog...   \n51903   A Review of Technical Standards for Smart Cities   \n\n                                            Source Title  ...   \n0                                     INFORMATION FUSION  ...  \\\n1                     IEEE VEHICULAR TECHNOLOGY MAGAZINE  ...   \n2      IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ...  ...   \n3                                      COMPUTER NETWORKS  ...   
\n4                        IEEE TRANSACTIONS ON SMART GRID  ...   \n...                                   
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Publication Type</th>\n      <th>Authors</th>\n      <th>Book Authors</th>\n      <th>Book Editors</th>\n      <th>Book Group Authors</th>\n      <th>Author Full Names</th>\n      <th>Book Author Full Names</th>\n      <th>Group Authors</th>\n      <th>Article Title</th>\n      <th>Source Title</th>\n      <th>...</th>\n      <th>UT (Unique WOS ID)</th>\n      <th>issn_var</th>\n      <th>issn</th>\n      <th>Domain_English</th>\n      <th>Field_English</th>\n      <th>SubField_English</th>\n      <th>2.00 SEQ</th>\n      <th>Source_title</th>\n      <th>srcid</th>\n      <th>issn_type</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>J</td>\n      <td>Yan, Z; Jing, XY; Pedrycz, W</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Yan, Zheng; Jing, Xuyang; Pedrycz, Witold</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>LEFusing and mining opinions for reputation ge...</td>\n      <td>INFORMATION FUSION</td>\n      <td>...</td>\n      <td>WOS:000394070100013</td>\n      <td>issn</td>\n      <td>15662535</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Artificial Intelligence &amp; Image Processing</td>\n      <td>31</td>\n      <td>Information Fusion</td>\n      <td>2.609900e+04</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>J</td>\n      <td>Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Sookhak, Mehdi; Yu, F. 
Richard; He, Ying; Tale...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>FOG VEHICULAR COMPUTING Augmentation of Fog Co...</td>\n      <td>IEEE VEHICULAR TECHNOLOGY MAGAZINE</td>\n      <td>...</td>\n      <td>WOS:000408568800008</td>\n      <td>issn</td>\n      <td>15566072</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Networking &amp; Telecommunications</td>\n      <td>37</td>\n      <td>IEEE Vehicular Technology Magazine</td>\n      <td>5.200153e+09</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>J</td>\n      <td>Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Deep Reinforcement Learning for Intelligent In...</td>\n      <td>IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ...</td>\n      <td>...</td>\n      <td>WOS:000502789700018</td>\n      <td>issn</td>\n      <td>23327731</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Networking &amp; Telecommunications</td>\n      <td>37</td>\n      <td>IEEE Transactions on Cognitive Communications ...</td>\n      <td>2.110085e+10</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>J</td>\n      <td>Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>An Intelligent UAV based Data Aggregation Algo...</td>\n      <td>COMPUTER NETWORKS</td>\n      <td>...</td>\n      <td>WOS:000626758800004</td>\n      <td>issn</td>\n      <td>13891286</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Networking &amp; 
Telecommunications</td>\n      <td>37</td>\n      <td>Computer Networks</td>\n      <td>2.681100e+04</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrix_levels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "outputs": [],
   "source": [
    "record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n",
    "record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n",
    "record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n",
    "country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "outputs": [],
   "source": [
    "# Basic network layout"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "outputs": [],
   "source": [
    "country_collabs = record_countries.merge(record_countries, on=record_col)\n",
    "country_collabs = country_collabs[country_collabs[\"Country_x\"]!=country_collabs[\"Country_y\"]]\n",
    "country_collabs[\"weight\"] = 0.5"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "outputs": [],
   "source": [
    "inst_collabs = record_institution.merge(record_institution, on=record_col)\n",
    "inst_collabs = inst_collabs[inst_collabs[\"Institution_x\"]!=inst_collabs[\"Institution_y\"]]\n",
    "inst_collabs[\"weight\"] = 0.5"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "outputs": [
    {
     "data": {
      "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n       'Conference Date', 'Conference Location', 'Conference Sponsor',\n       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n       'Funding Text', 'Cited References', 'Cited Reference Count',\n       'Times Cited, WoS Core', 'Times Cited, All Databases',\n       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n       'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n       'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n       'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n       'Number of Pages', 'WoS Categories', 'Web of Science Index',\n       'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n       'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n       'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n       'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n       'srcid', 'issn_type'],\n      dtype='object')"
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos.columns"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "outputs": [
    {
     "data": {
      "text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']"
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drop_cols = [ws for ws in wos.columns if ((\"uthor\" in ws or \"ddress\" in ws or \"ORCID\" in\n",
    "                                           ws or \"esearcher\" in ws or \"ditor\" in ws or \"name\" in ws or 'SEQ' in ws) and \"eyword\" not in ws)]\n",
    "drop_cols"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "outputs": [],
   "source": [
    "outdir=\"wos_processed_data\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "outputs": [],
   "source": [
    "os.makedirs(outdir, exist_ok=True)\n",
    "\n",
    "wos.drop(columns=drop_cols).to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n",
    "\n",
    "record_countries.to_excel(f\"{outdir}/wos_countries.xlsx\", index=False)\n",
    "\n",
    "record_author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n",
    "\n",
    "record_institution.to_excel(f\"{outdir}/wos_institution_locations.xlsx\", index=False)\n",
    "\n",
    "kw_df.to_excel(f\"{outdir}/wos_keywords.xlsx\", index=False)\n",
    "\n",
    "country_types.to_excel(f\"{outdir}/wos_country_types.xlsx\", index=False)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "outputs": [],
   "source": [
    "wos.drop(columns=drop_cols).to_csv(f\"{outdir}/wos_processed.csv\", index=False, sep='\\t')\n",
    "\n",
    "record_countries.to_csv(f\"{outdir}/wos_countries.csv\", index=False, sep='\\t')\n",
    "\n",
    "record_author_locations.to_csv(f\"{outdir}/wos_author_locations.csv\", index=False, sep='\\t')\n",
    "\n",
    "record_institution.to_csv(f\"{outdir}/wos_institution_locations.csv\", index=False, sep='\\t')\n",
    "\n",
    "kw_df.to_csv(f\"{outdir}/wos_keywords.csv\", index=False, sep='\\t')\n",
    "\n",
    "country_types.to_csv(f\"{outdir}/wos_country_types.csv\", index=False, sep='\\t')\n",
    "\n",
    "inst_collabs.to_csv(f\"{outdir}/wos_inst_collabs.csv\", index=False, sep='\\t')\n",
    "\n",
    "country_collabs.to_csv(f\"{outdir}/wos_country_collabs.csv\", index=False, sep='\\t')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "outputs": [],
   "source": [
    "wos_areas.to_csv(f\"{outdir}/wos_research_areas.csv\", index=False, sep='\\t')\n",
    "\n",
    "wos_subcat.to_csv(f\"{outdir}/wos_categories.csv\", index=False, sep='\\t')"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}