{ "cells": [ { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "import shutil\n", "from flashgeotext.geotext import GeoText\n", "import re\n", "# import spacy\n", "#\n", "# nlp = spacy.load(\"en_core_web_lg\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "record_col=\"UT (Unique WOS ID)\"\n", "outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\"" ] }, { "cell_type": "code", "execution_count": 8, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of initial records: 27672\n", "Number of filtered records: 24653\n" ] } ], "source": [ "wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n", "print(f'Number of initial records: {len(wos)}')\n", "metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n", "\n", "\n", "metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n", "metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n", "metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "\n", "wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n", "wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n", "\n", "wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n", "wos = wos_merge.sort_values(by=\"issn_var\",ascending=False).drop_duplicates(subset=record_col)\n", "\n", "# drop entries not indexed by metrix\n", "wos = wos[~wos[\"Domain_English\"].isna()]\n", "# drop duplicates (based on doi)\n", "wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n", "wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n", "print(f'Number of filtered records: {len(wos)}')" ] }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "data": { "text/plain": " Article Title \n23070 Stochastic bias of colour-selected BAO tracers... \\\n30139 A novel integrative approach elucidates fine-s... \n4538 Optimal Number of Clusters by Measuring Simila... \n34242 Analyzing the Noise Robustness of Deep Neural ... \n26727 Learning to Prompt for Open-Vocabulary Object ... \n... ... \n3290 Research on Reverse Skyline Query Algorithm Ba... \n45159 Using Recurrent Neural Network for Intelligent... \n21653 Output-Bounded and RBFNN-Based Position Tracki... \n43983 A Novel 3D Intelligent Cluster Method for Mali... \n11880 BlockHammer: Improving Flash Reliability by Ex... \n\n Keywords Plus \n23070 DIGITAL SKY SURVEY; BARYON ACOUSTIC-OSCILLATIO... \\\n30139 CHAOTIC GENETIC PATCHINESS; PELAGIC LARVAL DUR... \n4538 VALIDATION; ALGORITHM; TUTORIAL \n34242 VISUAL ANALYTICS \n26727 NaN \n... ... \n3290 MAPREDUCE \n45159 NaN \n21653 IMPEDANCE CONTROL; ROBOT \n43983 NETWORK INTRUSION DETECTION; DDOS DETECTION; A... \n11880 MEMORY; PERFORMANCE; RETENTION; ENDURANCE; OPT... \n\n Author Keywords \n23070 galaxies: evolution; galaxies: haloes; galaxie... 
\n30139 NaN \n4538 Event-related potentials; Optimal number of cl... \n34242 Neurons; Visualization; Data visualization; Fe... \n26727 NaN \n... ... \n3290 Big Data; Database Management; Database Query;... \n45159 water resources; intelligent prediction; water... \n21653 Security tele-surgery; RBFNN; bilateral positi... \n43983 Auto encoder; DDos detection; Attack classific... \n11880 Reliability; Three-dimensional displays; Error... \n\n[100 rows x 3 columns]", "text/html": "
" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600266 ANME\n1 WOS:000208863600266 PYROSEQUENCING\n2 WOS:000208863600266 AOM\n3 WOS:000208863600266 COMMUNITY STRUCTURE\n4 WOS:000208863600266 NYEGGA\n.. ... ...\n99 WOS:000286328200009 NORTH-EAST ASIA\n100 WOS:000286328200009 PLEISTOCENE\n101 WOS:000286328200009 SAKHALIN ISLAND\n102 WOS:000286373200134 NEURAL NETWORKS\n103 WOS:000286373200134 FUZZY LOGIC\n\n[100 rows x 2 columns]", "text/html": "
" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kw_df = pd.DataFrame()\n", "for c in [\"Keywords Plus\",\"Author Keywords\"]:\n", " kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n", " kwp.name = 'keyword_all'\n", " kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n", "kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n", "kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n", "kw_df.head(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n1 WOS:000209236900010 ACTIVE PERCEPTION; SPARSE CODING; REINFORCEMEN...\n2 WOS:000209331600009 SLEEP PATTERN; ELDER-CARE; PRESSURE SENSOR; NA...\n3 WOS:000209571700012 PERSONALIZED MEDICINE; COMPLEX NETWORK; CLINIC...\n4 WOS:000209810700046 CORROSION CHARACTERIZATION; FEATURE EXTRACTION...", "text/html": "
" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n", "wos_kwd_concat.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "# from keybert import KeyBERT\n", "#\n", "# kw_model = KeyBERT(model='all-mpnet-base-v2')\n", "#\n", "# def kwd_extract(text):\n", "# keywords = kw_model.extract_keywords(text,\n", "#\n", "# keyphrase_ngram_range=(1, 2),\n", "#\n", "# stop_words='english',\n", "#\n", "# highlight=False,\n", "#\n", "# top_n=3)\n", "# return \"; \".join([i[0].upper() for i in keywords])\n", "#\n", "# kwd_extract(text=\"Artificial Intelligence: New Frontiers in Real-Time Inverse Scattering and Electromagnetic Imaging - In recent years, artificial intelligence (AI) techniques have been developed rapidly. With the ...\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "data": { "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "geotext = GeoText()\n", "\n", "def extract_location(input_text, key='countries'):\n", " anomalies = {\"Malta\":\"Malta\",\n", " \"Mongolia\":\"Mongolia\",\n", " \"Quatar\":\"Qatar\",\n", " \"Qatar\":\"Qatar\",\n", " \"Ethiop\":\"Ethiopia\",\n", " \"Nigeria\":\"Nigeria\",\n", " \"BELAR\":\"Belarus\",\n", " \"Venezuela\":\"Venezuela\",\n", " \"Cyprus\":\"Cyprus\",\n", " \"Ecuador\":\"Ecuador\",\n", " \"U Arab\":\"United Arab Emirates\",\n", " \"Syria\":\"Syria\",\n", " \"Uganda\":\"Uganda\",\n", " \"Yemen\":\"Yemen\",\n", " \"Mali\":\"Mali\",\n", " \"Senegal\":\"Senegal\",\n", " \"Vatican\":\"Vatican\",\n", " \"Uruguay\":\"Uruguay\",\n", " 
\"Panama\":\"Panama\",\n", " \"Fiji\":\"Fiji\",\n", " \"Faroe\":\"Faroe Islands\",\n", " \"Macedonia\":\"Macedonia\",\n", " 'Mozambique':'Mozambique',\n", " \"Kuwait\":\"Kuwait\",\n", " \"Libya\":\"Libya\",\n", " \"Turkiy\":\"Turkey\",\n", " \"Liberia\":\"Liberia\",\n", " \"Namibia\":\"Namibia\",\n", " \"Ivoire\":\"Ivory Coast\",\n", " \"Guatemala\":\"Gutemala\",\n", " \"Paraguay\":\"Paraguay\",\n", " \"Honduras\":\"Honduras\",\n", " \"Nicaragua\":\"Nicaragua\",\n", " \"Trinidad\":\"Trinidad & Tobago\",\n", " \"Liechtenstein\":\"Liechtenstein\",\n", " \"Greenland\":\"Denmark\"}\n", "\n", " extracted = geotext.extract(input_text=input_text)\n", " found = extracted[key].keys()\n", " if len(sorted(found))>0:\n", " return sorted(found)[0]\n", " elif key=='countries':\n", " for i in ['Scotland','Wales','England', 'N Ireland']:\n", " if i in input_text:\n", " return 'United Kingdom'\n", " for j in anomalies.keys():\n", " if j in input_text:\n", " return anomalies.get(j)\n", " else:\n", " return None\n", "\n", "with open('../eu_members.txt',\"r\") as f:\n", " eu_countries=f.readline().split(\",\")\n", " eu_countries=[i.strip() for i in eu_countries]\n", "\n", "def country_type(country):\n", " if country in eu_countries:\n", " return \"EU\"\n", " elif country==\"China\":\n", " return \"China\"\n", " elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n", " return \"Non-EU associate\"\n", " else:\n", " return \"Other\"\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n", "locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n", "locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n", "locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])\n", "locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n", "locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n", "locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))" ] }, { "cell_type": "code", "execution_count": 32, "outputs": [], "source": [ "scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n", "locations=locations[locations[\"Country_Type\"].isin(scope_types)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208863600266 Univ Bergen, Ctr Geobiol, Dept Biol, N-5020 B... \\\n2 WOS:000208863600266 Chinese Acad Sci, Guangzhou Inst Geochem, Gua... \n3 WOS:000208863600266 Univ Bergen, Dept Earth Sci, N-5020 Bergen, N... \n5 WOS:000209236900010 Goethe Univ Frankfurt, Frankfurt Inst Adv Stu... \n6 WOS:000209236900010 Ecole Normale Super Cachan Bretagne, Bruz, Fr... \n\n Country City Country_Type \n1 Norway Bergen Non-EU associate \\\n2 China Guangzhou China \n3 Norway Bergen Non-EU associate \n5 Germany Frankfurt (Oder) EU \n6 France Cachan EU \n\n Institution \n1 Univ Bergen \n2 Chinese Acad Sci \n3 Univ Bergen \n5 Goethe Univ Frankfurt \n6 Ecole Normale Super Cachan Bretagne ", "text/html": "
" }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n", "univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n", "univ_locations = univ_locations.drop_duplicates()\n", "univ_locations.head()" ] }, { "cell_type": "code", "execution_count": 38, "outputs": [], "source": [ "import hashlib\n", "\n", "def md5hash(s: str):\n", " return hashlib.md5(s.encode('utf-8')).hexdigest()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n1 WOS:000208863600266 Norway Non-EU associate \n2 WOS:000208863600266 Norway Non-EU associate \n3 WOS:000208863600266 Norway Non-EU associate \n4 WOS:000208863600266 Norway Non-EU associate \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n1 d603b89121a1f279bf03b6f65d1389fa \n2 2fcb84e544f1558ead61dcf846027b7d \n3 6550a1d5fbd1b643f4732d40f2ed4d78 \n4 56485e2bd170d199887af88f3d0a9777 ", "text/html": "
" }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n", "author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n", "author_locations = author_locations.drop(columns=\"Authors_of_address\")\n", "author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n", "author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n", "author_locations = author_locations.drop(columns=\"Author_name\")\n", "author_locations.head()" ] }, { "cell_type": "code", "execution_count": 44, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n6 WOS:000209236900010 China China \n7 WOS:000209236900010 China China \n8 WOS:000209236900010 France EU \n10 WOS:000209236900010 Germany EU \n... ... ... ... \n321236 WOS:000953367000001 China China \n321237 WOS:000953367000001 China China \n321238 WOS:000953367000001 China China \n321239 WOS:000953367000001 China China \n321241 WOS:000953367000001 United Kingdom Non-EU associate \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n6 b406b8485c286091a46aca4999f294d3 \n7 abf37b879540b7c2eeb86787a467de29 \n8 2c559a54c654ab6dbc23d20ae82a0501 \n10 2c559a54c654ab6dbc23d20ae82a0501 \n... ... \n321236 99ef5c82ba66e07f9aa2d3f9fc7c45f7 \n321237 d013bf53d094540f90db9224b3eb9922 \n321238 702962f6fe47bac08520ae556a8e0e02 \n321239 99ef5c82ba66e07f9aa2d3f9fc7c45f7 \n321241 9cc42be570a5464bca0ea4b6b39d0271 \n\n[277884 rows x 4 columns]", "text/html": "
" }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations[author_locations['author_str_id'].duplicated(False)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n", "# author_primary_region\n", "\n", "china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n", "eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n", "assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n", "\n", "\n", "# records that have distinct authors with different country affiliations\n", "valid_scope = wos[((wos[record_col].isin(china))\n", " &\n", " ((wos[record_col].isin(eu))\n", " |\n", " (wos[record_col].isin(assoc))))][record_col].unique()" ] }, { "cell_type": "code", "execution_count": 54, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n299168 WOS:000840488600001 China China \n299169 WOS:000840488600001 China China \n101376 WOS:000434663200012 China China \n101374 WOS:000434663200012 China China \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n299168 3462304c908993a828cdd0ff91ea4aaa \n299169 68ab59c442eb882af13a8273439cf840 \n101376 304c36b8b677f41a489894dc66a8461c \n101374 c04795fe195dcadb58bed5c81125ea35 ", "text/html": "
" }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_primary_region.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of records: 24653\n", "Number of valid records: 22081\n" ] } ], "source": [ "print(f'Number of records: {len(wos)}')\n", "print(f'Number of valid cooperation records: {len(valid_scope)}')" ] }, { "cell_type": "code", "execution_count": 66, "outputs": [], "source": [ "wos = wos[wos[record_col].isin(valid_scope)]\n", "locations = locations[locations[record_col].isin(valid_scope)]\n", "univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n", "author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n", "author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n", "affiliations = affiliations.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 67, "outputs": [ { "data": { "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 2688\nUNIVERSITY OF LONDON 1251\nUDICE-FRENCH RESEARCH UNIVERSITIES 1038\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 978\nTSINGHUA UNIVERSITY 960\n ... \nITALIAN INSTITUTE FOR GENOMIC MEDICINE (IIGM) 1\nSHENYANG INSTITUTE OF ENGINEERING 1\nXIANYANG NORMAL UNIVERSITY 1\nAGILENT TECHNOLOGIES 1\nUNIVERSIDAD DE ESPECIALIDADES ESPIRITU SANTO 1\nName: count, Length: 6117, dtype: int64" }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 68, "outputs": [ { "data": { "text/plain": "Institution\n Chinese Acad Sci 2708\n Tsinghua Univ 1170\n Shanghai Jiao Tong Univ 978\n Zhejiang Univ 902\n Univ Chinese Acad Sci 753\n ... 
\n Univ Namur 1\n Qianhai Inst Innovat Res 1\n UN 1\n Vienna Int Ctr 1\n Engn Res Ctr Urban Underground Space Dev Zhejiang 1\nName: count, Length: 11670, dtype: int64" }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 61, "outputs": [ { "data": { "text/plain": "22081" }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 62, "outputs": [ { "data": { "text/plain": "22081" }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 63, "outputs": [ { "data": { "text/plain": "99343" }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 64, "outputs": [ { "data": { "text/plain": "130533" }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "158916 162684\n" ] } ], "source": [ "aff_ = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "loc_ = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n", "print(len(aff_),len(loc_))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "unique_inst = sorted([i.split(\" \") for i in list(affiliations[\"Affiliations\"].unique())], key=len)\n", "# unique_inst = [[''.join(filter(str.isalnum, i)) for i in i_list] for i_list in unique_inst]\n", "unique_inst = [[i.strip(\",\").strip(\"(\").strip(\")\") for i in i_list] for i_list in unique_inst]\n", "unique_inst" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def institution_chunk_norris(text):\n", " for i in unique_inst:\n", " text_split=text.split(\" \")\n", " text_split=[i.strip(\",\").strip(\"(\").strip(\")\") for i in text_split]\n", " overlap = all(token in text_split for token in i)\n", " if overlap:\n", " return (\" \".join(i))\n", " return \"ERROR\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations_merged\"] = affiliations[\"Affiliations\"].apply(lambda x: institution_chunk_norris(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations_merged\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[affiliations[\"Affiliations_merged\"]==\"ERROR\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.metrics import edit_distance\n", "from 
nltk.metrics import edit_distance_align\n", "#results = df.apply(lambda x: edit_distance(x[\"column1\"], x[\"column2\"]), axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations = affiliations.merge(univ_locations, on=record_col)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.upper().str.strip()\n", "affiliations[\"Institution\"] = affiliations[\"Institution\"].str.upper().str.strip()\n", "\n", "affiliations[\"levehnstein\"] = affiliations.apply(\n", " lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n", "affiliations.head()" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "def tok_overlap(lon_str, short_str):\n", " l,s = lon_str.split(\" \"), short_str.split(\" \")\n", " # create a pairwise distance matrix using NumPy\n", " distance_matrix = np.fromfunction(np.vectorize(lambda i, j: edit_distance(l[int(i)], s[int(j)])), shape=(len(l), len(s)))\n", " distance_frame = pd.DataFrame(data=distance_matrix, columns=s, index=l)\n", "\n", " return min(distance_frame.min().sum(),distance_frame.T.min().sum())\n", "\n", "# lon=(\"UNIVERSITY\",\"AMSTERDAM\",\"TECHNICAL\", \"LOCAL\")\n", "# sho=(\"UNIV\",\"AMSTER\",\"TECH\",\"LOCAL\")\n", "# tok_overlap(lon_str=\" \".join(lon),short_str=\" \".join(sho)).min().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "tok_overlap(lon_str=\" \".join(l),short_str=\" \".join(s)).shape" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations[\"token_overlap\"] = affiliations.apply(\n", " lambda x: tok_overlap(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n", "affiliations.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations.sort_values(by=[record_col,\"Affiliations\",\"token_overlap\"], ascending=[False,False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "helper = affiliations.sort_values(by=[\"Affiliations\",\"token_overlap\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "afh = helper[[\"Affiliations\",\"Institution\",\"Country\"]]\n", "afh.groupby(\"Affiliations\")[\"Institution\"].agg(pd.Series.mode)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "afh.groupby(\"Affiliations\")[\"Country\"].agg(pd.Series.mode)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "helper1 = affiliations.sort_values(by=[\"Affiliations\",\"token_overlap\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "afh1 = helper1[[\"Affiliations\",\"Institution\",\"City\",\"Country\",\"Country_Type\"]]\n", "mode1_i = afh1.groupby(\"Affiliations\")[\"Institution\"].apply(pd.Series.mode).reset_index()\n", "mode1_c = afh1.groupby(\"Affiliations\")[\"Country\"].apply(pd.Series.mode).reset_index()\n", "mode1_city = afh1.groupby(\"Affiliations\")[\"City\"].apply(pd.Series.mode).reset_index()\n", "mode1_type = afh1.groupby(\"Affiliations\")[\"Country_Type\"].apply(pd.Series.mode).reset_index()\n", 
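"# repeat the per-affiliation mode aggregation below, this time ranking candidate institutions by Levenshtein distance;\n", "# the token-overlap and Levenshtein candidate sets are then concatenated and the most frequent value kept per Affiliation\n",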
"\n", "helper2 = affiliations.sort_values(by=[\"Affiliations\",\"levehnstein\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "afh2 = helper2[[\"Affiliations\",\"Institution\",\"City\",\"Country\",\"Country_Type\"]]\n", "mode2_i = afh2.groupby(\"Affiliations\")[\"Institution\"].apply(pd.Series.mode).reset_index()\n", "mode2_c = afh2.groupby(\"Affiliations\")[\"Country\"].apply(pd.Series.mode).reset_index()\n", "mode2_city = afh2.groupby(\"Affiliations\")[\"City\"].apply(pd.Series.mode).reset_index()\n", "mode2_type = afh2.groupby(\"Affiliations\")[\"Country_Type\"].apply(pd.Series.mode).reset_index()\n", "\n", "mode_i = pd.concat([mode1_i,mode2_i],ignore_index=True)[[\"Affiliations\",\"Institution\"]].groupby(\"Affiliations\")[\"Institution\"].agg(\n", " lambda x: pd.Series.mode(x)[0])\n", "mode_c = pd.concat([mode1_c,mode2_c],ignore_index=True)[[\"Affiliations\",\"Country\"]].groupby(\"Affiliations\")[\"Country\"].agg(\n", " lambda x: pd.Series.mode(x)[0])\n", "mode_city = pd.concat([mode1_city,mode2_city],ignore_index=True)[[\"Affiliations\",\"City\"]].groupby(\"Affiliations\")[\"City\"].agg(\n", " lambda x: pd.Series.mode(x)[0])\n", "mode_type = pd.concat([mode1_type,mode2_type],ignore_index=True)[[\"Affiliations\",\"Country_Type\"]].groupby(\"Affiliations\")[\"Country_Type\"].agg(\n", " lambda x: pd.Series.mode(x)[0])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "from functools import reduce\n", "dfs = [mode_i, mode_c, mode_city, mode_type]\n", "mode_final = reduce(lambda left,right: pd.merge(left,right,on='Affiliations'), dfs)\n", "mode_final = mode_final.reset_index()\n", "mode_final.columns = [\"Affiliations\",\"Institution (short name from address)\",\"Country_candidate\",\"City_candidate\",\"Country_type_candidate\"]\n", "mode_final" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "aff_lookup = affiliations[[\"Affiliations\",\"Institution\",\"levehnstein\"]].drop_duplicates().sort_values(by=[\"Affiliations\",\"levehnstein\"],ascending=[True,True])\n", "aff_lookup" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "aff_lookup_levehnstein = aff_lookup.copy()\n", "aff_lookup_overlap = aff_lookup.copy()\n", "inst_short = sorted([i.split(\" \") for i in list(aff_lookup_overlap[\"Institution\"].unique())], key=len)\n", "inst_short" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "aff_lookup.drop_duplicates(subset=\"Affiliations\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "# aff_m = pd.DataFrame(affiliations[\"Affiliations\"].unique(), columns=[\"Affiliations\"])\n", "# inst_m = pd.DataFrame(affiliations[[\"Institution\",\"Country_Type\",\"Country\",\"City\"]].drop_duplicates(),columns=[\"Institution\",\"Country_Type\",\"Country\",\"City\"])\n", "#\n", "# aff_lookup = aff_m.merge(inst_m, how='cross')\n", "#\n", "# # aff_lookup[\"levehnstein\"] = aff_lookup.apply(\n", "# # lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n", "#\n", "# aff_lookup.assign(distance=[*map(edit_distance, aff_lookup.Affiliations, aff_lookup.Institution)])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ 
"affiliations[\"levehnstein\"].plot(kind=\"hist\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations[\"token_overlap\"].plot(kind=\"hist\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations[affiliations[\"Affiliations\"].str.contains(\"A*STAR\",regex=False)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations = affiliations.sort_values(by=[record_col,\"Affiliations\",\"levehnstein\"], ascending=[False,False,True])\n", "affiliations_merge = affiliations.drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "affiliations_merge.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_cat[\"WoS Categories\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n", "wos_areas[\"Research Areas\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos_areas[\"Research Areas\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[c for c in wos.columns if \"_English\" in c]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "from matplotlib.ticker import MaxNLocator\n", "import math" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos = wos[((wos[\"Publication Year\"]<2023) & (~wos['Domain_English'].isna()))]\n", "\n", "metrix_levels = [c for c in wos.columns if \"_English\" in c]\n", "for m in metrix_levels:\n", " wos[m] = wos[m].replace({\"article-level classification\":\"Miscellaneous\"})\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrix_levels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "outdir=\"wos_processed_data\"" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "os.makedirs(outdir, exist_ok=True)\n", "\n", "wos.to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n", "\n", "locations.drop(columns=\"Addresses\").to_excel(f\"{outdir}/wos_addresses.xlsx\", index=False)\n", "\n", "affiliations_merge.to_excel(f\"{outdir}/wos_affiliations.xlsx\", index=False)\n", "\n", "author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n", "\n", "univ_locations.to_excel(f\"{outdir}/wos_univ_locations.xlsx\", index=False)\n", "mode_final.to_excel(f\"{outdir}/wos_univ_locations_v2.xlsx\", index=False)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "kw_df.to_excel(f\"{outdir}/keywords.xlsx\", index=False)\n", "wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)" ], "metadata": { "collapsed": false } }, { 
"cell_type": "code", "execution_count": null, "outputs": [], "source": [ "# wos_nlp = wos[[record_col,\"Article Title\",\"Abstract\"]]\n", "wos = wos.merge(wos_kwd_concat, on=record_col)\n", "wos[\"Document\"] = wos[\"Article Title\"].str.cat(wos[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ')\n", "# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n", "\n", "vectors = list()\n", "vector_norms = list()\n", "\n", "for doc in nlp.pipe(wos['Document'].astype('unicode').values, batch_size=100,\n", " n_process=4):\n", " vectors.append(doc.vector)\n", " vector_norms.append(doc.vector_norm)\n", "\n", "wos['vector'] = vectors\n", "wos['vector_norm'] = vector_norms\n", "wos['vector_norm'].plot(kind=\"hist\")\n", "from sklearn.manifold import TSNE\n", "import matplotlib.pyplot as plt\n", "% matplotlib inline\n", "\n", "vector_data = pd.DataFrame(wos[\"vector\"].to_list(), index=wos[record_col]).reset_index()\n", "vector_data.head()\n", "\n", "labels = vector_data.values[:, 0]\n", "record_vectors = vector_data.values[:, 1:]\n", "\n", "tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n", "tnse_2d = tsne_model.fit_transform(record_vectors)\n", "tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n", "tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n", "tnse_data.head()\n", "import seaborn as sns\n", "\n", "wos_plot = wos.merge(tnse_data, on=record_col)\n", "\n", "g = sns.scatterplot(wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'], x=\"TNSE-X\", y=\"TNSE-Y\",\n", " hue='Domain_English', s=1)\n", "g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", "wos_plot.head()\n", "wos_nlp = wos_plot[[record_col, \"Document\", \"keyword_all\", \"TNSE-X\", \"TNSE-Y\"]]\n", "g = sns.kdeplot(\n", " data=wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'],\n", " x=\"TNSE-X\", y=\"TNSE-Y\", hue='Domain_English',\n", " thresh=.1,\n", ")\n", "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Domain" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "group = 'Domain_English'\n", "data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=record_col)\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.barplot(data, x=record_col, y=group)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group = ['Publication Year','Domain_English']\n", "# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group = ['Publication Year','Domain_English']\n", "# data = wos.groupby(group)[record_col].nunique().unstack(fill_value=0).stack().reset_index().rename(columns={0:record_col}).sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# g=sns.lineplot(data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# g.legend(title=None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Field" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "# group = ['Publication Year',\"Domain_English\",'Field_English']\n", "# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# g = sns.FacetGrid(data, col=\"Domain_English\", col_wrap=3, height=5)\n", "# g.map_dataframe(sns.lineplot,x=group[0],y=record_col,hue=group[-1])\n", "# g.set_titles(col_template=\"{col_name}\")\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# # g.add_legend()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import matplotlib.pyplot as plt\n", "# for cat in sorted(data[group[-2]].unique()):\n", "# sub_data = data[data[group[-2]]==cat]\n", "# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n", "# ,group[-1],fill_value=0)\n", "# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# g.legend(title=None)\n", "# g.set_title(cat)\n", "# g.yaxis.set_major_locator(MaxNLocator(integer=True))\n", "# plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# SubField" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group = ['Publication Year',\"Domain_English\",'Field_English',\"SubField_English\"]\n", "# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import matplotlib.pyplot as plt\n", "# for cat in sorted(data[group[-2]].unique()):\n", "# sub_data = data[data[group[-2]]==cat]\n", "# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n", "# ,group[-1],fill_value=0)\n", "# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# g.legend(title=None,bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., ncols=math.ceil(len(g.legend_.texts)/12))\n", "# g.set_title(cat)\n", "# plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }