added new keywords including refined syntax

2 years ago · 7ad4a02854
parent 35e731f31d
commit 7ad4a02854
13 changed files with 6860 additions and 622 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,4 @@
 /PATSTAT/EU_CH_scope/cpc_defs.csv
 /misc_code/
+/PATSTAT/appln_data.xlsx
+/PATSTAT/person_data.xlsx
--- a/PATSTAT/patstat_analysis_pipeline.ipynb
+++ b/PATSTAT/patstat_analysis_pipeline.ipynb
--- a/PBI/ZSI.pbix
+++ b/PBI/ZSI.pbix
--- a/TODO.ipynb
+++ b/TODO.ipynb
@ -25,10 +25,7 @@
    "- Fetched CPC description data taxonomy, merged with PATSTAT: lost around one percent of the records"
   ],
   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
+    "collapsed": false
   }
  },
  {
@ -39,10 +36,7 @@
    "\"\"\"(TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\")) AND (CU=PEOPLES R CHINA AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=REPUBLIC OF CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN))\"\"\""
   ],
   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
+    "collapsed": false
   }
  },
  {
@ -55,10 +49,7 @@
    "# AI ETHICS keyword!!!"
   ],
   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
+    "collapsed": false
   }
  },
  {
@ -69,10 +60,7 @@
    "# Only CPC classification? Or some basic PTC? (ASEAN analysis had some)"
   ],
   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
+    "collapsed": false
   }
  },
  {
@ -83,10 +71,7 @@
    "# Patent classes"
   ],
   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
+    "collapsed": false
   }
  },
  {
@ -107,10 +92,7 @@
    "- List of visuals & tables / in a specified manner/ can be"
   ],
   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%% md\n"
-    }
+    "collapsed": false
   }
  },
  {
@ -124,10 +106,7 @@
    "# plus countries UK Norway Switzerland | Turkey Serbia"
   ],
   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
+    "collapsed": false
   }
  }
 ],
--- a/WOS/ai_scope_keywords.txt
+++ b/WOS/ai_scope_keywords.txt
@ -129,7 +129,7 @@ hadoop*,
 mapreduce,
 map$reduce,
 large$ dataset*,
-data warehouse*,
+data warehous*,
 predictive analytic*,
 no$sql,
 nosql,
--- a/WOS/ai_scope_keywords_suggestions.txt
+++ b/WOS/ai_scope_keywords_suggestions.txt
@ -1,15 +1,15 @@
 multi-task learning,
 multi-label classification,
-variational autoencoders,
-optimization algorithms,
+variational autoencoder*,
+optimization algorithm*,
 stochastic gradient descent,
 adaptive moment estimation,
 momentum,
-attention mechanisms,
-transformer models,
-sequence-to-sequence models,
+attention mechanism*,
+transformer model*,
+sequence-to-sequence model*,
 seq2seq,
-bidirectional encoder representations from transformers,
+bidirectional encoder representations from transformer*,
 openai gpt-3,
 openai codex,
 roberta,
@ -21,11 +21,9 @@ healthcare ai,
 affective computing,
 edge computing,
 fog computing,
-capsule networks,
-extreme learning machines,
-restricted boltzmann machines,
-linear regression,
-logistic regression,
+capsule network*,
+extreme learning machine*,
+restricted boltzmann machine*,
 markov chain monte carlo,
 hierarchical clustering,
 k-means clustering,
@ -33,23 +31,23 @@ dbscan,
 latent dirichlet allocation ,
 gaussian mixture models ,
 support vector regression ,
-multimodal learning
-multimodal representation
-cross-modal learning
-cross-lingual learning
-zero-shot learning
-few-shot learning
-one-shot learning
-continual learning
-lifelong learning
-meta-learning
-active learning
-semi-supervised learning
-instance-based learning
-rule-based learning
-knowledge graphs
-ontology learning
-semantic web
+multimodal learning,
+multimodal representation,
+cross-modal learning,
+cross-lingual learning,
+zero-shot learning,
+few-shot learning,
+one-shot learning,
+continual learning,
+lifelong learning,
+meta-learning,
+active learning,
+semi-supervised learning,
+instance-based learning,
+rule-based learning,
+knowledge graph*,
+ontology learning,
+semantic web*,
 ai applications,
 ai in finance,
 ai in medicine,
@ -59,7 +57,7 @@ ai in marketing,
 ai in transportation,
 ai in retail,
 ai in gaming,
-ai in sports,
+ai in sport*,
 ai in security,
 ai in human resources,
 ai in customer service,
@ -132,4 +130,15 @@ ai in speech synthesis,
 ai in text-to-speech,
 ai in speech-to-text,
 ai in voice cloning,
-ai in voice assistants
+ai in voice assistants,
+linear regression &! p=,
+logistic regression &! p=,
+language model*,
+data driven*,
+kalman filter*,
+target tracking,
+learning rate*,
+covariance matri*,
+loss function*,
+topic model*,
+high performance comput*
--- a/WOS/wos_extract/geckodriver.log
+++ b/WOS/wos_extract/geckodriver.log
--- a/WOS/wos_extract/wos_downloads/aggregated/2023-04-12-10-40-14-335447save/analyze_scope.txt
+++ b/WOS/wos_extract/wos_downloads/aggregated/2023-04-12-10-40-14-335447save/analyze_scope.txt
@ -1,13 +0,0 @@
-Publication Years	Record Count	% of 45 355
-2022	9081	20.022
-2021	8630	19.028
-2020	6800	14.993
-2019	5502	12.131
-2018	4087	9.011
-2017	2816	6.209
-2016	2338	5.155
-2015	1818	4.008
-2014	1571	3.464
-2013	1135	2.502
-2012	863	1.903
-2011	714	1.574
--- a/WOS/wos_extract/wos_query_generator_simplesyntax.ipynb
+++ b/WOS/wos_extract/wos_query_generator_simplesyntax.ipynb
--- a/WOS/wos_extract/wos_search_kw_analysis.ipynb
+++ b/WOS/wos_extract/wos_search_kw_analysis.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
@ -11,12 +11,13 @@
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import os\n",
-    "import matplotlib.pyplot as plt"
+    "import matplotlib.pyplot as plt\n",
+    "from pandas.errors import EmptyDataError"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 16,
   "outputs": [],
   "source": [
    "agg_df = pd.DataFrame()\n",
@ -28,10 +29,15 @@
    "            path=os.path.join(root, filename)\n",
    "            with open(os.path.join(root, 'query.txt'),'r') as f:\n",
    "                query = f.readline()\n",
+    "            try:\n",
+    "                chunk = pd.read_csv(path, sep='\\t')[[\"Publication Years\",\"Record Count\"]]\n",
+    "            except EmptyDataError:\n",
+    "                path=os.path.join(root, \"analyze.txt\")\n",
    "                chunk = pd.read_csv(path, sep='\\t')[[\"Publication Years\",\"Record Count\"]]\n",
    "            chunk[\"name\"] = filename.replace(\".txt\",\"\")\n",
    "            chunk[\"query\"] = query\n",
-    "            agg_df = pd.concat([chunk,agg_df],ignore_index=True)"
+    "            agg_df = pd.concat([chunk,agg_df],ignore_index=True)\n",
+    "        # elif len(files)==1:\n"
   ],
   "metadata": {
    "collapsed": false
@ -39,12 +45,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 17,
   "outputs": [],
   "source": [
    "agg_df[\"region\"] = agg_df[\"query\"].apply(lambda x: \"EU+China\" if \"CU\" in x else \"Global\")\n",
-    "agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0])\n",
-    "agg_df[\"kw_token\"] = agg_df[\"kw_token\"].apply(lambda x: \"OR COMPOSITE\" if \" OR \" in x else x)"
+    "agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0].strip(\"(\"))\n",
+    "agg_df[\"kw_token\"] = agg_df[\"kw_token\"].apply(lambda x: \"COMPOSITE SEARCH\" if \" OR \" in x else x)"
   ],
   "metadata": {
    "collapsed": false
@ -52,7 +58,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 18,
   "outputs": [],
   "source": [
    "agg_df = agg_df[~agg_df[\"Record Count\"].isna()]"
@ -63,14 +69,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 19,
   "outputs": [
    {
     "data": {
-      "text/plain": "                                                 query  Record Count\n0    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...         972.0\n1    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...         451.0\n2    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...          30.0\n3    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...          12.0\n4    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...           5.0\n..                                                 ...           ...\n384          TS=(\"word embedding*\") AND PY=(2011-2022)        7068.0\n385             TS=(\"word vector*\") AND PY=(2011-2022)        1747.0\n386  TS=((\"face recognition\" NOT \"brain\")) AND PY=(...       19690.0\n387  TS=((\"object detection\" NOT \"brain\")) AND PY=(...       28989.0\n388  TS=((\"speech recognition\" NOT \"brain\")) AND PY...       19912.0\n\n[389 rows x 2 columns]",
-      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>query</th>\n      <th>Record Count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>972.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>451.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>30.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>12.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>5.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>384</th>\n      <td>TS=(\"word embedding*\") AND PY=(2011-2022)</td>\n      <td>7068.0</td>\n    </tr>\n    <tr>\n      <th>385</th>\n      <td>TS=(\"word vector*\") AND PY=(2011-2022)</td>\n      <td>1747.0</td>\n    </tr>\n    <tr>\n      <th>386</th>\n      <td>TS=((\"face recognition\" NOT \"brain\")) AND PY=(...</td>\n      <td>19690.0</td>\n    </tr>\n    <tr>\n      <th>387</th>\n      <td>TS=((\"object detection\" NOT \"brain\")) AND PY=(...</td>\n      <td>28989.0</td>\n    </tr>\n    <tr>\n      <th>388</th>\n      <td>TS=((\"speech recognition\" NOT \"brain\")) AND PY...</td>\n      <td>19912.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>389 rows × 2 columns</p>\n</div>"
+      "text/plain": "                                                 query  Record Count\n0    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...         972.0\n1    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...         451.0\n2    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...          30.0\n3    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...          12.0\n4    CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...           5.0\n..                                                 ...           ...\n543  TS=((\"face recognition\" NOT \"brain\")) AND PY=(...       19690.0\n544  TS=((\"linear regression\" NOT \"p=\")) AND PY=(20...       91493.0\n545  TS=((\"logistic regression\" NOT \"p=\")) AND PY=(...      171776.0\n546  TS=((\"object detection\" NOT \"brain\")) AND PY=(...       28989.0\n547  TS=((\"speech recognition\" NOT \"brain\")) AND PY...       19912.0\n\n[548 rows x 2 columns]",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>query</th>\n      <th>Record Count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>972.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>451.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>30.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>12.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n      <td>5.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>543</th>\n      <td>TS=((\"face recognition\" NOT \"brain\")) AND PY=(...</td>\n      <td>19690.0</td>\n    </tr>\n    <tr>\n      <th>544</th>\n      <td>TS=((\"linear regression\" NOT \"p=\")) AND PY=(20...</td>\n      <td>91493.0</td>\n    </tr>\n    <tr>\n      <th>545</th>\n      <td>TS=((\"logistic regression\" NOT \"p=\")) AND PY=(...</td>\n      <td>171776.0</td>\n    </tr>\n    <tr>\n      <th>546</th>\n      <td>TS=((\"object detection\" NOT \"brain\")) AND PY=(...</td>\n      <td>28989.0</td>\n    </tr>\n    <tr>\n      <th>547</th>\n      <td>TS=((\"speech recognition\" NOT \"brain\")) AND PY...</td>\n      <td>19912.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>548 rows × 2 columns</p>\n</div>"
     },
-     "execution_count": 5,
+     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -84,7 +90,40 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 29,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "                        kw_token  Record Count\n0               COMPOSITE SEARCH       62205.0\n1              \"neural network*\"       10999.0\n2              \"machine* learn*\"        5765.0\n3                  \"deep learn*\"        5211.0\n4                     \"momentum\"        4974.0\n..                           ...           ...\n243       \"artificial cognition\"           1.0\n244  \"ai in disaster management\"           1.0\n245          \"vector embedding*\"           1.0\n246              \"ai in finance\"           1.0\n247    \"content based filtering\"           1.0\n\n[248 rows x 2 columns]",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>kw_token</th>\n      <th>Record Count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>COMPOSITE SEARCH</td>\n      <td>62205.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>\"neural network*\"</td>\n      <td>10999.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>\"machine* learn*\"</td>\n      <td>5765.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>\"deep learn*\"</td>\n      <td>5211.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>\"momentum\"</td>\n      <td>4974.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>243</th>\n      <td>\"artificial cognition\"</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>244</th>\n      <td>\"ai in disaster management\"</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>245</th>\n      <td>\"vector embedding*\"</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>246</th>\n      <td>\"ai in finance\"</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>247</th>\n      <td>\"content based filtering\"</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>248 rows × 2 columns</p>\n</div>"
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "kw_ranks = agg_df[agg_df[\"region\"]==\"EU+China\"].groupby(\"kw_token\",as_index=False)[\"Record Count\"].sum().sort_values(by=\"Record Count\", ascending=False).reset_index().drop(columns=\"index\")\n",
+    "kw_ranks"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "outputs": [],
+   "source": [
+    "kw_ranks.to_excel(\"kw_token_ranked.xlsx\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
   "outputs": [],
   "source": [
    "# agg_df = agg_df[agg_df[\"Publication Years\"].str.startswith(\"20\", na=False)].copy()\n",
@ -97,7 +136,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 22,
   "outputs": [],
   "source": [
    "# agg_df[\"Publication Years\"].value_counts()"
@ -108,7 +147,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 23,
   "outputs": [],
   "source": [
    "agg_df.to_excel(r'C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_processed_data\\query_yearly_agg.xlsx', index=False)"
--- a/WOS/wos_extract/wossel_miners.py
+++ b/WOS/wos_extract/wossel_miners.py
@ -171,6 +171,13 @@ def wos_fetch_entries(query_str="TS=\"web of science\" AND PY=(2008-2010)",

        # Renaming the file
        time.sleep(0.1)
+        try:
+            os.rename(old_name, new_name)
+        except PermissionError:
+            time.sleep(5)
+            os.rename(old_name, new_name)
+        except FileNotFoundError:
+            time.sleep(5)
            os.rename(old_name, new_name)

    time.sleep(2)
--- a/WOS/wos_nlp_demo.ipynb
+++ b/WOS/wos_nlp_demo.ipynb
--- a/WOS/wos_univ_normalizer.ipynb
+++ b/WOS/wos_univ_normalizer.ipynb
@ -0,0 +1,440 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "# Importing libraries and modules, plus some settings for the notebook\n",
+    "\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "import numpy as np\n",
+    "from scipy.sparse import csr_matrix\n",
+    "import sparse_dot_topn.sparse_dot_topn as ct  #Cosine Similarity\n",
+    "import time\n",
+    "from tqdm import tqdm"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "outputs": [],
+   "source": [
+    "def wikinorm(univ_string):\n",
+    "    from googlesearch import search\n",
+    "    from nltk.metrics import edit_distance\n",
+    "    from operator import itemgetter\n",
+    "    from numpy.random import default_rng\n",
+    "    rng = default_rng()\n",
+    "    results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n",
+    "    univ_name = univ_string.split(\",\")[0]\n",
+    "    u_results = [i.title for i in results if \"Category:\" not in i.title]\n",
+    "    return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "outputs": [],
+   "source": [
+    "def replace_uppercase_words(text):\n",
+    "    words = text.split()\n",
+    "    all_uppercase = all(word.isupper() for word in words)\n",
+    "    all_lowercase = all(word.islower() for word in words)\n",
+    "    if all_uppercase or all_lowercase:\n",
+    "        return text\n",
+    "    else:\n",
+    "        result = []\n",
+    "        for word in words:\n",
+    "            w = word.strip()\n",
+    "            if not w.isupper() and not w.islower():\n",
+    "                result.append(w)\n",
+    "        return \" \".join(result).strip()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO: Pandarallel will run on 4 workers.\n",
+      "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n",
+      "\n",
+      "WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
+      "https://nalepae.github.io/pandarallel/troubleshooting/\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=38767), Label(value='0 / 38767')))…",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "ee2cde76498b4a46a2e87ea6c971aed9"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "155067\n"
+     ]
+    }
+   ],
+   "source": [
+    "outdir=\"wos_processed_data\"\n",
+    "univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
+    "\n",
+    "from pandarallel import pandarallel\n",
+    "pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
+    "\n",
+    "univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
+    "print(len(univ))"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "         UT (Unique WOS ID)                   Institution         Country   \n84810   WOS:000571399800004               Univ Birmingham  United Kingdom  \\\n122264  WOS:000732918800001              Univ Southampton  United Kingdom   \n135675  WOS:000799234000004                           UCL  United Kingdom   \n153134  WOS:000900724501058                Kore Univ Enna           Italy   \n51445   WOS:000455277600005                Univ Sheffield  United Kingdom   \n...                     ...                           ...             ...   \n21043   WOS:000372583700005          Vrije Univ Amsterdam     Netherlands   \n1938    WOS:000297611600011                    Univ Essex  United Kingdom   \n64691   WOS:000490430500091  Xian Jiaotong Liverpool Univ           China   \n25740   WOS:000386793200001              Chinese Acad Sci           China   \n112682  WOS:000696110800001           Dalian Univ Technol           China   \n\n                    Institution_harm  \n84810                Univ Birmingham  \n122264              Univ Southampton  \n135675                           UCL  \n153134                Kore Univ Enna  \n51445                 Univ Sheffield  \n...                              ...  \n21043           Vrije Univ Amsterdam  \n1938                      Univ Essex  \n64691   Xian Jiaotong Liverpool Univ  \n25740               Chinese Acad Sci  \n112682           Dalian Univ Technol  \n\n[100 rows x 4 columns]",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Institution</th>\n      <th>Country</th>\n      <th>Institution_harm</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>84810</th>\n      <td>WOS:000571399800004</td>\n      <td>Univ Birmingham</td>\n      <td>United Kingdom</td>\n      <td>Univ Birmingham</td>\n    </tr>\n    <tr>\n      <th>122264</th>\n      <td>WOS:000732918800001</td>\n      <td>Univ Southampton</td>\n      <td>United Kingdom</td>\n      <td>Univ Southampton</td>\n    </tr>\n    <tr>\n      <th>135675</th>\n      <td>WOS:000799234000004</td>\n      <td>UCL</td>\n      <td>United Kingdom</td>\n      <td>UCL</td>\n    </tr>\n    <tr>\n      <th>153134</th>\n      <td>WOS:000900724501058</td>\n      <td>Kore Univ Enna</td>\n      <td>Italy</td>\n      <td>Kore Univ Enna</td>\n    </tr>\n    <tr>\n      <th>51445</th>\n      <td>WOS:000455277600005</td>\n      <td>Univ Sheffield</td>\n      <td>United Kingdom</td>\n      <td>Univ Sheffield</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>21043</th>\n      <td>WOS:000372583700005</td>\n      <td>Vrije Univ Amsterdam</td>\n      <td>Netherlands</td>\n      <td>Vrije Univ Amsterdam</td>\n    </tr>\n    <tr>\n      <th>1938</th>\n      <td>WOS:000297611600011</td>\n      <td>Univ Essex</td>\n      <td>United Kingdom</td>\n      <td>Univ Essex</td>\n    </tr>\n    <tr>\n      <th>64691</th>\n      <td>WOS:000490430500091</td>\n      <td>Xian Jiaotong Liverpool Univ</td>\n      <td>China</td>\n      <td>Xian Jiaotong 
Liverpool Univ</td>\n    </tr>\n    <tr>\n      <th>25740</th>\n      <td>WOS:000386793200001</td>\n      <td>Chinese Acad Sci</td>\n      <td>China</td>\n      <td>Chinese Acad Sci</td>\n    </tr>\n    <tr>\n      <th>112682</th>\n      <td>WOS:000696110800001</td>\n      <td>Dalian Univ Technol</td>\n      <td>China</td>\n      <td>Dalian Univ Technol</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "univ.sample(100)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "              Country                                   Institution_harm   \n7736           France                                       Yncrea Ouest  \\\n13752           Spain                                        Univ Carlos   \n15855  United Kingdom               Northumbria Univ Newcastle Upon Tyne   \n12514          Norway                                          Nord Univ   \n602             China                                Henan Polytech Univ   \n...               ...                                                ...   \n11620           Italy                                      Deep Blue Srl   \n11183           Italy                            Univ Giustino Fortunato   \n7433          Estonia                           Platinum Software Dev Co   \n5129            China  State & Local Joint Engn Lab Estuarine Hydraul Te   \n6799            China                                                MOA   \n\n       count  \n7736       9  \n13752      1  \n15855      1  \n12514      1  \n602       87  \n...      ...  \n11620      1  \n11183      3  \n7433       1  \n5129       1  \n6799       1  \n\n[100 rows x 3 columns]",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Country</th>\n      <th>Institution_harm</th>\n      <th>count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7736</th>\n      <td>France</td>\n      <td>Yncrea Ouest</td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <th>13752</th>\n      <td>Spain</td>\n      <td>Univ Carlos</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>15855</th>\n      <td>United Kingdom</td>\n      <td>Northumbria Univ Newcastle Upon Tyne</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>12514</th>\n      <td>Norway</td>\n      <td>Nord Univ</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>602</th>\n      <td>China</td>\n      <td>Henan Polytech Univ</td>\n      <td>87</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>11620</th>\n      <td>Italy</td>\n      <td>Deep Blue Srl</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>11183</th>\n      <td>Italy</td>\n      <td>Univ Giustino Fortunato</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>7433</th>\n      <td>Estonia</td>\n      <td>Platinum Software Dev Co</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>5129</th>\n      <td>China</td>\n      <td>State &amp; Local Joint Engn Lab Estuarine Hydraul Te</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>6799</th>\n      <td>China</td>\n      <td>MOA</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "univ_norm = univ.groupby(\"Country\", as_index=False)[\"Institution_harm\"].value_counts()\n",
+    "# univ_norm[\"search_for\"] = univ_norm[\"Institution\"]+\", \" + univ_norm[\"Country\"]+ \", wikipedia\"\n",
+    "univ_norm.sample(100)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "outputs": [],
+   "source": [
+    "# from pandarallel import pandarallel\n",
+    "# pandarallel.initialize(progress_bar=True, nb_workers=2)\n",
+    "#\n",
+    "# df_sample[\"search_result\"] = df_sample[\"search_for\"].parallel_apply(wikinorm)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "['Austria', 'Belgium']"
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(univ_norm[\"Country\"].unique())[0:2]"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "outputs": [],
+   "source": [
+    "def ngrams(string, n=3):\n",
+    "\n",
+    "    string = re.sub(r'[,-./]|\\sBD',r'', string)\n",
+    "    ngrams = zip(*[string[i:] for i in range(n)])\n",
+    "    return [''.join(ngram) for ngram in ngrams]\n",
+    "\n",
+    "# To calculate the similarity between two vectors of TF-IDF values, the cosine similarity is usually used.\n",
+    "# The resulting matrix is very sparse, and scikit-learn deals with this nicely by returning a sparse CSR matrix.\n",
+    "\n",
+    "def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
+    "    # force A and B as a CSR matrix.\n",
+    "    # If they have already been CSR, there is no overhead\n",
+    "    A = A.tocsr()\n",
+    "    B = B.tocsr()\n",
+    "    M, _ = A.shape\n",
+    "    _, N = B.shape\n",
+    "\n",
+    "    idx_dtype = np.int32\n",
+    "\n",
+    "    nnz_max = M*ntop\n",
+    "\n",
+    "    indptr = np.zeros(M+1, dtype=idx_dtype)\n",
+    "    indices = np.zeros(nnz_max, dtype=idx_dtype)\n",
+    "    data = np.zeros(nnz_max, dtype=A.dtype)\n",
+    "\n",
+    "    ct.sparse_dot_topn(\n",
+    "        M, N, np.asarray(A.indptr, dtype=idx_dtype),\n",
+    "        np.asarray(A.indices, dtype=idx_dtype),\n",
+    "        A.data,\n",
+    "        np.asarray(B.indptr, dtype=idx_dtype),\n",
+    "        np.asarray(B.indices, dtype=idx_dtype),\n",
+    "        B.data,\n",
+    "        ntop,\n",
+    "        lower_bound,\n",
+    "        indptr, indices, data)\n",
+    "\n",
+    "    return csr_matrix((data,indices,indptr),shape=(M,N))\n",
+    "\n",
+    "# unpacks the resulting sparse matrix\n",
+    "\n",
+    "def get_matches_df(sparse_matrix, name_vector, top=None):\n",
+    "    non_zeros = sparse_matrix.nonzero()\n",
+    "\n",
+    "    sparserows = non_zeros[0]\n",
+    "    sparsecols = non_zeros[1]\n",
+    "\n",
+    "    if top:\n",
+    "        nr_matches = top\n",
+    "    else:\n",
+    "        nr_matches = sparsecols.size\n",
+    "\n",
+    "    left_side = np.empty([nr_matches], dtype=object)\n",
+    "    right_side = np.empty([nr_matches], dtype=object)\n",
+    "    similarity = np.zeros(nr_matches)\n",
+    "\n",
+    "    for index in range(0, nr_matches):\n",
+    "        left_side[index] = name_vector[sparserows[index]]\n",
+    "        right_side[index] = name_vector[sparsecols[index]]\n",
+    "        similarity[index] = sparse_matrix.data[index]\n",
+    "\n",
+    "    return pd.DataFrame({'left_side': left_side,\n",
+    "                          'right_side': right_side,\n",
+    "                           'similarity': similarity})\n",
+    "\n",
+    "\n",
+    "def discrepancy_filter(df):\n",
+    "    f_df = df.copy()\n",
+    "    tokenlist = [\"Med\", \"Hosp\", \"Tech\", \"Univ\", \"Acad\", \"Poly\"]\n",
+    "    for token in tokenlist:\n",
+    "        f_df = f_df[~(((f_df[\"right_side\"].str.contains(token))&\n",
+    "                       (~f_df[\"left_side\"].str.contains(token)))\n",
+    "                      |\n",
+    "                ((f_df[\"left_side\"].str.contains(token))&\n",
+    "                 (~f_df[\"right_side\"].str.contains(token))))].copy()\n",
+    "    return f_df\n",
+    "\n",
+    "\n",
+    "# Define a function to get the high and low counts for each row\n",
+    "def get_high_low_counts(row):\n",
+    "    if row['left_count'] > row['right_count']:\n",
+    "        high_count = row['left_count']\n",
+    "        low_count = row['right_count']\n",
+    "    else: #row['left_count'] < row['right_count']:\n",
+    "        high_count = row['right_count']\n",
+    "        low_count = row['left_count']\n",
+    "    # else:\n",
+    "    #     if len(row['left_side']) > len(row['right_side']):\n",
+    "    #             high_count = len(row['left_side'])\n",
+    "    #             low_count = len(row['right_side'])\n",
+    "    #     else:\n",
+    "    #             high_count = len(row['right_side'])\n",
+    "    #             low_count = len(row['left_side'])\n",
+    "    return pd.Series([high_count, low_count])"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 130,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "30it [00:00, 53.27it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "merger = pd.DataFrame()\n",
+    "\n",
+    "for i in tqdm(filter(lambda c: c!=\"China\", list(univ_norm[\"Country\"].unique()))):\n",
+    "    sub_inst = univ_norm[univ_norm[\"Country\"]==i].reset_index()\n",
+    "    types = sub_inst['Institution_harm']\n",
+    "    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
+    "    tf_idf_matrix = vectorizer.fit_transform(types)\n",
+    "    t1 = time.time()\n",
+    "    matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)\n",
+    "    t = time.time()-t1\n",
+    "\n",
+    "    # Store the matches in a new dataframe called matches_df\n",
+    "    matches_df = get_matches_df(matches, types)\n",
+    "    matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches\n",
+    "    matches_df = discrepancy_filter(matches_df).reset_index(drop=True)\n",
+    "    matches_df[\"Country\"] = i\n",
+    "    # matches_df = matches_df[pd.DataFrame(np.sort(matches_df[['left_side','right_side']].values,1)).duplicated()]\n",
+    "    # matches_df = matches_df[~matches_df[['left_side', 'right_side']].apply(frozenset, axis=1).duplicated()]\n",
+    "    merger = pd.concat([merger,matches_df], ignore_index=True)\n",
+    "\n",
+    "for s in [\"left\",\"right\"]:\n",
+    "    merger[f\"{s}_count\"] = merger[f\"{s}_side\"].apply(lambda x: len(univ[univ[\"Institution_harm\"] == x]))\n",
+    "\n",
+    "# Apply the function to create a new column\n",
+    "merger[['high_count', 'low_count']] = merger.apply(get_high_low_counts, axis=1)\n",
+    "\n",
+    "# Use apply again to create the high_side and low_side columns\n",
+    "merger['high_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] > row['right_count'] else row['right_side'], axis=1)\n",
+    "merger['low_side'] = merger.apply(lambda row: row['left_side'] if row['left_count'] <= row['right_count'] else row['right_side'], axis=1)\n",
+    "\n",
+    "# Drop the high_count and low_count columns if they are not needed\n",
+    "# merger.drop(['high_count', 'low_count'], axis=1, inplace=True)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1192\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "1192it [00:44, 26.68it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "fuzzymerger = merger[[\"Country\",\"low_side\",\"high_side\",\"high_count\",\"low_count\",\"similarity\"]].drop_duplicates()\n",
+    "fuzzymerger = fuzzymerger.sort_values(by=[\"low_side\",\"high_count\"], ascending=[True,False])\n",
+    "fuzzymerger = fuzzymerger.drop_duplicates(subset=[\"Country\",\"low_side\"]).sort_values(by=\"high_count\", ascending=True).reset_index(drop=True)\n",
+    "print(len(fuzzymerger))\n",
+    "univ_harm = univ.copy()\n",
+    "univ_harm[\"merge_iter\"] = 0\n",
+    "for i,row in tqdm(fuzzymerger.iterrows()):\n",
+    "    univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
+    "                   (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"merge_iter\"] += 1\n",
+    "    univ_harm.loc[((univ_harm[\"Country\"]==row[\"Country\"])&\n",
+    "                   (univ_harm[\"Institution_harm\"]==row[\"low_side\"])),\"Institution_harm\"] = row[\"high_side\"]"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "         UT (Unique WOS ID)                           Institution Country   \n244     WOS:000286472300003                            Univ Trent   Italy  \\\n364     WOS:000287586100011                            Univ Trent   Italy   \n410     WOS:000287939200011      Abdus Salam Int Ctr Theoret Phys   Italy   \n765     WOS:000290996200002                            Univ Trent   Italy   \n907     WOS:000291698400013                       INFN Sez Roma 1   Italy   \n...                     ...                                   ...     ...   \n153063  WOS:000900129900175        Univ Rome Campus Biomed Aquila   Italy   \n154775  WOS:000929737300001                    Prevent & Res Inst   Italy   \n154813  WOS:000929737300001                       Ist Super Sanit   Italy   \n154855  WOS:000933331200004                       Univ Federio II   Italy   \n154857  WOS:000933331200004  INAF Osservatorio Astron Capodimonte   Italy   \n\n                         Institution_harm  merge_iter  \n244                           Univ Trento           1  \n364                           Univ Trento           1  \n410     Abdus Salaam Int Ctr Theoret Phys           1  \n765                           Univ Trento           1  \n907                              Sez Roma           1  \n...                                   ...         ...  \n153063    Univ Rome Campus Biomed LAquila           1  \n154775                 Prevent & Res Inst           2  \n154813                   Ist Super Sanita           1  \n154855                       Univ Federio           2  \n154857          Osserv Astron Capodimonte           1  \n\n[375 rows x 5 columns]",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Institution</th>\n      <th>Country</th>\n      <th>Institution_harm</th>\n      <th>merge_iter</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>244</th>\n      <td>WOS:000286472300003</td>\n      <td>Univ Trent</td>\n      <td>Italy</td>\n      <td>Univ Trento</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>364</th>\n      <td>WOS:000287586100011</td>\n      <td>Univ Trent</td>\n      <td>Italy</td>\n      <td>Univ Trento</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>410</th>\n      <td>WOS:000287939200011</td>\n      <td>Abdus Salam Int Ctr Theoret Phys</td>\n      <td>Italy</td>\n      <td>Abdus Salaam Int Ctr Theoret Phys</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>765</th>\n      <td>WOS:000290996200002</td>\n      <td>Univ Trent</td>\n      <td>Italy</td>\n      <td>Univ Trento</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>907</th>\n      <td>WOS:000291698400013</td>\n      <td>INFN Sez Roma 1</td>\n      <td>Italy</td>\n      <td>Sez Roma</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>153063</th>\n      <td>WOS:000900129900175</td>\n      <td>Univ Rome Campus Biomed Aquila</td>\n      <td>Italy</td>\n      <td>Univ Rome Campus Biomed LAquila</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>154775</th>\n      <td>WOS:000929737300001</td>\n      <td>Prevent &amp; Res Inst</td>\n      <td>Italy</td>\n      <td>Prevent &amp; Res Inst</td>\n      
<td>2</td>\n    </tr>\n    <tr>\n      <th>154813</th>\n      <td>WOS:000929737300001</td>\n      <td>Ist Super Sanit</td>\n      <td>Italy</td>\n      <td>Ist Super Sanita</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>154855</th>\n      <td>WOS:000933331200004</td>\n      <td>Univ Federio II</td>\n      <td>Italy</td>\n      <td>Univ Federio</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>154857</th>\n      <td>WOS:000933331200004</td>\n      <td>INAF Osservatorio Astron Capodimonte</td>\n      <td>Italy</td>\n      <td>Osserv Astron Capodimonte</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n<p>375 rows × 5 columns</p>\n</div>"
+     },
+     "execution_count": 120,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "univ_harm[((univ_harm[\"merge_iter\"]>0) & (univ_harm[\"Country\"]==\"Italy\"))]"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}