blabla/WOS/fuzzy_identifier.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#  Importing libraries and module and some setting for notebook\n",
    "\n",
    "import pandas as pd\n",
    "import re\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import numpy as np\n",
    "from scipy.sparse import csr_matrix\n",
    "import sparse_dot_topn.sparse_dot_topn as ct  #Cosine Similarity\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "outputs": [],
   "source": [
    "# Load the Web of Science export; the file is tab-separated despite its .csv name.\n",
    "outfile = 'wos_extract_complete.csv'\n",
    "record_col = \"UT (Unique WOS ID)\"\n",
    "wos = pd.read_csv(outfile, sep=\"\\t\", low_memory=False)\n",
    "\n",
    "# Split the ';'-separated affiliation field of every record into one row per affiliation.\n",
    "affiliations = (\n",
    "    wos.groupby(record_col)[\"Affiliations\"]\n",
    "    .apply(lambda x: x.str.split(';'))\n",
    "    .explode()\n",
    "    .reset_index()\n",
    "    .drop(columns=\"level_1\")\n",
    ")\n",
    "# Normalise spacing and case, then drop repeated (record, affiliation) pairs.\n",
    "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper()\n",
    "affiliations = affiliations.drop_duplicates()\n",
    "\n",
    "# Single-column frame of names on a fresh 0..n-1 index; missing names become \"UNKNOWN\".\n",
    "df = affiliations[[\"Affiliations\"]].fillna(\"UNKNOWN\").reset_index(drop=True)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [
    {
     "data": {
      "text/plain": "pandas.core.frame.DataFrame"
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confirm that df is a DataFrame rather than a Series.\n",
    "type(df)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "data": {
      "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES                            1237\nUDICE-FRENCH RESEARCH UNIVERSITIES                      664\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS)     658\nHELMHOLTZ ASSOCIATION                                   432\nTSINGHUA UNIVERSITY                                     429\n                                                       ... \nUNIVERSIDAD PONTIFICIA BOLIVARIANA                        1\nCALIFORNIA STATE UNIVERSITY CHICO                         1\nKOREA ENVIRONMENT INSTITUTE (KEI)                         1\nNATIONAL INSTITUTE OF TECHNOLOGY MEGHALAYA                1\nSAINT JAMES'S UNIVERSITY HOSPITAL                         1\nName: count, Length: 4906, dtype: int64"
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how many rows carry each distinct affiliation name.\n",
    "df.value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES                                           1237\nUNIVERSITY OF CHINESE ACADEMY OF SCIENCES, CAS                         420\nCHINESE ACADEMY OF AGRICULTURAL SCIENCES                                50\nCHINESE ACADEMY OF MEDICAL SCIENCES - PEKING UNION MEDICAL COLLEGE      50\nCHINESE ACADEMY OF ENGINEERING PHYSICS                                  11\nCHINESE ACADEMY OF FORESTRY                                              9\nCHINESE ACADEMY OF SURVEYING & MAPPING                                   8\nCHINESE ACADEMY OF SOCIAL SCIENCES                                       7\nCHINESE ACADEMY OF METEOROLOGICAL SCIENCES (CAMS)                        7\nCHINESE ACADEMY OF TROPICAL AGRICULTURAL SCIENCES                        3\nCHINESE ACADEMY OF GEOLOGICAL SCIENCES                                   3\nCHINESE ACADEMY OF FISHERY SCIENCES                                      2\nUNIVERSITY OF CHINESE ACADEMY OF SOCIAL SCIENCES                         1\nName: count, dtype: int64"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df is a DataFrame, which has no .str accessor, so filter on its \"Affiliations\" column.\n",
    "df.loc[df[\"Affiliations\"].str.contains(\"CHINESE ACADEMY\", na=False), \"Affiliations\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "expected string or bytes-like object",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mTypeError\u001B[0m                                 Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[35], line 18\u001B[0m\n\u001B[0;32m     10\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(ngram) \u001B[38;5;28;01mfor\u001B[39;00m ngram \u001B[38;5;129;01min\u001B[39;00m ngrams]\n\u001B[0;32m     16\u001B[0m \u001B[38;5;66;03m# Testing ngrams work for verification\u001B[39;00m\n\u001B[1;32m---> 18\u001B[0m \u001B[43mngrams\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43miloc\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;241;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\n",
      "Cell \u001B[1;32mIn[35], line 8\u001B[0m, in \u001B[0;36mngrams\u001B[1;34m(string, n)\u001B[0m\n\u001B[0;32m      6\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mngrams\u001B[39m(string, n\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m3\u001B[39m):\n\u001B[1;32m----> 8\u001B[0m     string \u001B[38;5;241m=\u001B[39m \u001B[43mre\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msub\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m[,-./]|\u001B[39;49m\u001B[38;5;124;43m\\\u001B[39;49m\u001B[38;5;124;43msBD\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstring\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m      9\u001B[0m     ngrams \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mzip\u001B[39m(\u001B[38;5;241m*\u001B[39m[string[i:] \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(n)])\n\u001B[0;32m     10\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(ngram) \u001B[38;5;28;01mfor\u001B[39;00m ngram \u001B[38;5;129;01min\u001B[39;00m ngrams]\n",
      "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\re.py:210\u001B[0m, in \u001B[0;36msub\u001B[1;34m(pattern, repl, string, count, flags)\u001B[0m\n\u001B[0;32m    203\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21msub\u001B[39m(pattern, repl, string, count\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m, flags\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m):\n\u001B[0;32m    204\u001B[0m \u001B[38;5;250m    \u001B[39m\u001B[38;5;124;03m\"\"\"Return the string obtained by replacing the leftmost\u001B[39;00m\n\u001B[0;32m    205\u001B[0m \u001B[38;5;124;03m    non-overlapping occurrences of the pattern in string by the\u001B[39;00m\n\u001B[0;32m    206\u001B[0m \u001B[38;5;124;03m    replacement repl.  repl can be either a string or a callable;\u001B[39;00m\n\u001B[0;32m    207\u001B[0m \u001B[38;5;124;03m    if a string, backslash escapes in it are processed.  If it is\u001B[39;00m\n\u001B[0;32m    208\u001B[0m \u001B[38;5;124;03m    a callable, it's passed the Match object and must return\u001B[39;00m\n\u001B[0;32m    209\u001B[0m \u001B[38;5;124;03m    a replacement string to be used.\"\"\"\u001B[39;00m\n\u001B[1;32m--> 210\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43m_compile\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpattern\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mflags\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msub\u001B[49m\u001B[43m(\u001B[49m\u001B[43mrepl\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstring\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcount\u001B[49m\u001B[43m)\u001B[49m\n",
      "\u001B[1;31mTypeError\u001B[0m: expected string or bytes-like object"
     ]
    }
   ],
   "source": [
    "# Character n-grams (trigrams by default) are the tokens used for fuzzy matching of\n",
    "# affiliation names: punctuation is stripped first, then every run of n consecutive\n",
    "# characters is collected.\n",
    "\n",
    "\n",
    "def ngrams(string, n=3):\n",
    "    \"\"\"Return the overlapping character n-grams of ``string``.\n",
    "\n",
    "    Commas, hyphens, dots and slashes are removed before slicing. A cleaned string\n",
    "    shorter than ``n`` (or ``n`` < 1) yields an empty list.\n",
    "    \"\"\"\n",
    "    # The hyphen is escaped so it is a literal rather than a ',' to '.' range. The\n",
    "    # '\\sBD' alternative is not applied: it silently deleted ' BD' from names.\n",
    "    cleaned = re.sub(r'[,\\-./]', '', string)\n",
    "    if n < 1:\n",
    "        return []\n",
    "    return [cleaned[start:start + n] for start in range(len(cleaned) - n + 1)]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "outputs": [],
   "source": [
    "# Convert every affiliation name into a TF-IDF (term frequency - inverse document frequency)\n",
    "# vector, using the character n-grams produced by ngrams() as the terms.\n",
    "# fit_transform returns one sparse row per entry of `types`.\n",
    "\n",
    "\n",
    "types = df['Affiliations']\n",
    "vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
    "tf_idf_matrix = vectorizer.fit_transform(types)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [
    {
     "data": {
      "text/plain": "<65153x6417 sparse matrix of type '<class 'numpy.float64'>'\n\twith 1752829 stored elements in Compressed Sparse Row format>"
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tf_idf_matrix"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "outputs": [],
   "source": [
    "# Cosine similarity between TF-IDF vectors, computed as a sparse matrix product whose\n",
    "# result is returned as a sparse CSR matrix.\n",
    "\n",
    "def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
    "    \"\"\"Multiply sparse ``A`` by sparse ``B`` via ``ct.sparse_dot_topn``.\n",
    "\n",
    "    ``ntop`` and ``lower_bound`` are forwarded to the helper (presumably the maximum\n",
    "    number of results kept per row and the minimum value to keep; confirm against the\n",
    "    sparse_dot_topn documentation). Output buffers are sized for ``ntop`` entries per\n",
    "    row of ``A``. Returns a CSR matrix of shape (rows of A, columns of B).\n",
    "    \"\"\"\n",
    "    # Force CSR layout; inputs that are already CSR incur no overhead.\n",
    "    left = A.tocsr()\n",
    "    right = B.tocsr()\n",
    "    n_rows = left.shape[0]\n",
    "    n_cols = right.shape[1]\n",
    "\n",
    "    index_type = np.int32\n",
    "    capacity = n_rows * ntop\n",
    "\n",
    "    # Pre-allocated output buffers that the helper fills in place.\n",
    "    row_ptr = np.zeros(n_rows + 1, dtype=index_type)\n",
    "    col_idx = np.zeros(capacity, dtype=index_type)\n",
    "    values = np.zeros(capacity, dtype=left.dtype)\n",
    "\n",
    "    ct.sparse_dot_topn(\n",
    "        n_rows,\n",
    "        n_cols,\n",
    "        np.asarray(left.indptr, dtype=index_type),\n",
    "        np.asarray(left.indices, dtype=index_type),\n",
    "        left.data,\n",
    "        np.asarray(right.indptr, dtype=index_type),\n",
    "        np.asarray(right.indices, dtype=index_type),\n",
    "        right.data,\n",
    "        ntop,\n",
    "        lower_bound,\n",
    "        row_ptr,\n",
    "        col_idx,\n",
    "        values,\n",
    "    )\n",
    "\n",
    "    return csr_matrix((values, col_idx, row_ptr), shape=(n_rows, n_cols))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SELFTIMED: 47.96097183227539\n"
     ]
    }
   ],
   "source": [
    "# Run the top-n cosine similarity of the TF-IDF matrix against its own transpose and time it.\n",
    "# Only the top 10 most similar items with a similarity above 0.8 are stored.\n",
    "\n",
    "started = time.time()\n",
    "matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.T, ntop=10, lower_bound=0.8)\n",
    "elapsed = time.time() - started\n",
    "print(\"SELFTIMED:\", elapsed)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "outputs": [],
   "source": [
    "# Unpack the sparse similarity matrix into a table of (left name, right name, similarity).\n",
    "\n",
    "def get_matches_df(sparse_matrix, name_vector, top=100):\n",
    "    \"\"\"Return the stored matches of ``sparse_matrix`` as a DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sparse_matrix : scipy sparse matrix\n",
    "        Entry (i, j) is the similarity between item i and item j.\n",
    "    name_vector : sequence or pandas.Series\n",
    "        Names looked up by position, i.e. element k names row/column k.\n",
    "    top : int or None, default 100\n",
    "        Maximum number of matches to return, in stored order. A falsy value\n",
    "        (None or 0) returns every match.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns ``left_side``, ``right_side`` and ``similarity``.\n",
    "    \"\"\"\n",
    "    # Take rows, columns and values from one COO view so they always stay aligned,\n",
    "    # even if the matrix stores explicit zeros (which are skipped, as nonzero() does).\n",
    "    coo = sparse_matrix.tocoo()\n",
    "    stored = coo.data != 0\n",
    "    rows, cols, values = coo.row[stored], coo.col[stored], coo.data[stored]\n",
    "\n",
    "    # Never ask for more matches than exist; that would index past the end.\n",
    "    nr_matches = min(top, rows.size) if top else rows.size\n",
    "\n",
    "    # Positional lookup, independent of any pandas index labels on name_vector.\n",
    "    names = np.asarray(name_vector, dtype=object)\n",
    "\n",
    "    return pd.DataFrame({'left_side': names[rows[:nr_matches]],\n",
    "                         'right_side': names[cols[:nr_matches]],\n",
    "                         'similarity': np.asarray(values[:nr_matches], dtype=np.float64)})"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "outputs": [
    {
     "data": {
      "text/plain": "                         left_side                         right_side   \n3274    LOUISIANA STATE UNIVERSITY  LOUISIANA STATE UNIVERSITY SYSTEM  \\\n7376           SHAOYANG UNIVERSITY  CHAOYANG UNIVERSITY OF TECHNOLOGY   \n7377           SHAOYANG UNIVERSITY  CHAOYANG UNIVERSITY OF TECHNOLOGY   \n7378           SHAOYANG UNIVERSITY  CHAOYANG UNIVERSITY OF TECHNOLOGY   \n7379           SHAOYANG UNIVERSITY  CHAOYANG UNIVERSITY OF TECHNOLOGY   \n...                            ...                                ...   \n599251        CHANGZHOU UNIVERSITY                YANGZHOU UNIVERSITY   \n599252        CHANGZHOU UNIVERSITY                YANGZHOU UNIVERSITY   \n599253        CHANGZHOU UNIVERSITY                YANGZHOU UNIVERSITY   \n599254        CHANGZHOU UNIVERSITY                YANGZHOU UNIVERSITY   \n599255        CHANGZHOU UNIVERSITY                YANGZHOU UNIVERSITY   \n\n        similarity  \n3274      0.883792  \n7376      0.807530  \n7377      0.807530  \n7378      0.807530  \n7379      0.807530  \n...            ...  \n599251    0.846442  \n599252    0.846442  \n599253    0.846442  \n599254    0.846442  \n599255    0.846442  \n\n[2192 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>left_side</th>\n      <th>right_side</th>\n      <th>similarity</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>3274</th>\n      <td>LOUISIANA STATE UNIVERSITY</td>\n      <td>LOUISIANA STATE UNIVERSITY SYSTEM</td>\n      <td>0.883792</td>\n    </tr>\n    <tr>\n      <th>7376</th>\n      <td>SHAOYANG UNIVERSITY</td>\n      <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n      <td>0.807530</td>\n    </tr>\n    <tr>\n      <th>7377</th>\n      <td>SHAOYANG UNIVERSITY</td>\n      <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n      <td>0.807530</td>\n    </tr>\n    <tr>\n      <th>7378</th>\n      <td>SHAOYANG UNIVERSITY</td>\n      <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n      <td>0.807530</td>\n    </tr>\n    <tr>\n      <th>7379</th>\n      <td>SHAOYANG UNIVERSITY</td>\n      <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n      <td>0.807530</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>599251</th>\n      <td>CHANGZHOU UNIVERSITY</td>\n      <td>YANGZHOU UNIVERSITY</td>\n      <td>0.846442</td>\n    </tr>\n    <tr>\n      <th>599252</th>\n      <td>CHANGZHOU UNIVERSITY</td>\n      <td>YANGZHOU UNIVERSITY</td>\n      <td>0.846442</td>\n    </tr>\n    <tr>\n      <th>599253</th>\n      <td>CHANGZHOU UNIVERSITY</td>\n      <td>YANGZHOU UNIVERSITY</td>\n      <td>0.846442</td>\n    </tr>\n    <tr>\n      <th>599254</th>\n      <td>CHANGZHOU UNIVERSITY</td>\n      <td>YANGZHOU UNIVERSITY</td>\n      <td>0.846442</td>\n    </tr>\n    <tr>\n      <th>599255</th>\n      <td>CHANGZHOU 
UNIVERSITY</td>\n      <td>YANGZHOU UNIVERSITY</td>\n      <td>0.846442</td>\n    </tr>\n  </tbody>\n</table>\n<p>2192 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect every stored match into matches_df and drop the exact (self) matches.\n",
    "# top=None returns all matches; a hard-coded count breaks as soon as the data changes.\n",
    "\n",
    "matches_df = get_matches_df(matches, types, top=None)\n",
    "matches_df = matches_df[matches_df['similarity'] < 0.99999]  # remove exact matches\n",
    "matches_df"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "outputs": [
    {
     "data": {
      "text/plain": "                                             left_side   \n0                        NATURAL HISTORY MUSEUM LONDON  \\\n1                        NATURAL HISTORY MUSEUM LONDON   \n2                        NATURAL HISTORY MUSEUM LONDON   \n3                        NATURAL HISTORY MUSEUM LONDON   \n4                        BULGARIAN ACADEMY OF SCIENCES   \n..                                                 ...   \n195                     DELFT UNIVERSITY OF TECHNOLOGY   \n196                     DELFT UNIVERSITY OF TECHNOLOGY   \n197  NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA   \n198  NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA   \n199  NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA   \n\n                                            right_side  similarity  \n0                        NATURAL HISTORY MUSEUM LONDON         1.0  \n1                        NATURAL HISTORY MUSEUM LONDON         1.0  \n2                        NATURAL HISTORY MUSEUM LONDON         1.0  \n3                        NATURAL HISTORY MUSEUM LONDON         1.0  \n4                        BULGARIAN ACADEMY OF SCIENCES         1.0  \n..                                                 ...         ...  \n195                     DELFT UNIVERSITY OF TECHNOLOGY         1.0  \n196                     DELFT UNIVERSITY OF TECHNOLOGY         1.0  \n197  NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA         1.0  \n198  NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA         1.0  \n199  NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA         1.0  \n\n[200 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>left_side</th>\n      <th>right_side</th>\n      <th>similarity</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>NATURAL HISTORY MUSEUM LONDON</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>BULGARIAN ACADEMY OF SCIENCES</td>\n      <td>BULGARIAN ACADEMY OF SCIENCES</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>195</th>\n      <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n      <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>196</th>\n      <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n      <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>197</th>\n      <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n      <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>198</th>\n      <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n      <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY 
- CHINA</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>199</th>\n      <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n      <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>200 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}