You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/fuzzy_identifier.ipynb

347 lines
21 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Importing libraries and module and some setting for notebook\n",
"\n",
"import pandas as pd\n",
"import re\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"from scipy.sparse import csr_matrix\n",
"import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 33,
"outputs": [],
"source": [
"# Load the tab-separated Web of Science export.\n",
"outfile = 'wos_extract_complete.csv'\n",
"record_col = \"UT (Unique WOS ID)\"\n",
"wos = pd.read_csv(outfile, sep=\"\\t\", low_memory=False)\n",
"\n",
"# Split the ';'-separated affiliation field into one row per (record, affiliation) pair.\n",
"affiliations = (\n",
"    wos.groupby(record_col)[\"Affiliations\"]\n",
"    .apply(lambda x: x.str.split(';'))\n",
"    .explode()\n",
"    .reset_index()\n",
"    .drop(columns=\"level_1\")\n",
")\n",
"\n",
"# Normalise whitespace and case, then drop repeated (record, affiliation) rows.\n",
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper()\n",
"affiliations = affiliations.drop_duplicates()\n",
"\n",
"# One-column frame of affiliation names with a fresh positional index;\n",
"# missing names are replaced by \"UNKNOWN\".\n",
"df = affiliations[\"Affiliations\"].fillna(\"UNKNOWN\").to_frame().reset_index(drop=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": "pandas.core.frame.DataFrame"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confirm the container type of `df`\n",
"type(df)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 1237\nUDICE-FRENCH RESEARCH UNIVERSITIES 664\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 658\nHELMHOLTZ ASSOCIATION 432\nTSINGHUA UNIVERSITY 429\n ... \nUNIVERSIDAD PONTIFICIA BOLIVARIANA 1\nCALIFORNIA STATE UNIVERSITY CHICO 1\nKOREA ENVIRONMENT INSTITUTE (KEI) 1\nNATIONAL INSTITUTE OF TECHNOLOGY MEGHALAYA 1\nSAINT JAMES'S UNIVERSITY HOSPITAL 1\nName: count, Length: 4906, dtype: int64"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count how often each distinct affiliation row occurs in `df`\n",
"df.value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 1237\nUNIVERSITY OF CHINESE ACADEMY OF SCIENCES, CAS 420\nCHINESE ACADEMY OF AGRICULTURAL SCIENCES 50\nCHINESE ACADEMY OF MEDICAL SCIENCES - PEKING UNION MEDICAL COLLEGE 50\nCHINESE ACADEMY OF ENGINEERING PHYSICS 11\nCHINESE ACADEMY OF FORESTRY 9\nCHINESE ACADEMY OF SURVEYING & MAPPING 8\nCHINESE ACADEMY OF SOCIAL SCIENCES 7\nCHINESE ACADEMY OF METEOROLOGICAL SCIENCES (CAMS) 7\nCHINESE ACADEMY OF TROPICAL AGRICULTURAL SCIENCES 3\nCHINESE ACADEMY OF GEOLOGICAL SCIENCES 3\nCHINESE ACADEMY OF FISHERY SCIENCES 2\nUNIVERSITY OF CHINESE ACADEMY OF SOCIAL SCIENCES 1\nName: count, dtype: int64"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count every affiliation whose name contains \"CHINESE ACADEMY\".\n",
"# `df` is a one-column DataFrame, and DataFrame has no .str accessor, so the\n",
"# string test has to go through the \"Affiliations\" column.\n",
"chinese_academy_mask = df[\"Affiliations\"].str.contains(\"CHINESE ACADEMY\", na=False, regex=False)\n",
"df.loc[chinese_academy_mask, \"Affiliations\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"ename": "TypeError",
"evalue": "expected string or bytes-like object",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[35], line 18\u001B[0m\n\u001B[0;32m 10\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(ngram) \u001B[38;5;28;01mfor\u001B[39;00m ngram \u001B[38;5;129;01min\u001B[39;00m ngrams]\n\u001B[0;32m 16\u001B[0m \u001B[38;5;66;03m# Testing ngrams work for verification\u001B[39;00m\n\u001B[1;32m---> 18\u001B[0m \u001B[43mngrams\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43miloc\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;241;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\n",
"Cell \u001B[1;32mIn[35], line 8\u001B[0m, in \u001B[0;36mngrams\u001B[1;34m(string, n)\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mngrams\u001B[39m(string, n\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m3\u001B[39m):\n\u001B[1;32m----> 8\u001B[0m string \u001B[38;5;241m=\u001B[39m \u001B[43mre\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msub\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m[,-./]|\u001B[39;49m\u001B[38;5;124;43m\\\u001B[39;49m\u001B[38;5;124;43msBD\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstring\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 9\u001B[0m ngrams \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mzip\u001B[39m(\u001B[38;5;241m*\u001B[39m[string[i:] \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(n)])\n\u001B[0;32m 10\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(ngram) \u001B[38;5;28;01mfor\u001B[39;00m ngram \u001B[38;5;129;01min\u001B[39;00m ngrams]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\re.py:210\u001B[0m, in \u001B[0;36msub\u001B[1;34m(pattern, repl, string, count, flags)\u001B[0m\n\u001B[0;32m 203\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21msub\u001B[39m(pattern, repl, string, count\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m, flags\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m):\n\u001B[0;32m 204\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"Return the string obtained by replacing the leftmost\u001B[39;00m\n\u001B[0;32m 205\u001B[0m \u001B[38;5;124;03m non-overlapping occurrences of the pattern in string by the\u001B[39;00m\n\u001B[0;32m 206\u001B[0m \u001B[38;5;124;03m replacement repl. repl can be either a string or a callable;\u001B[39;00m\n\u001B[0;32m 207\u001B[0m \u001B[38;5;124;03m if a string, backslash escapes in it are processed. If it is\u001B[39;00m\n\u001B[0;32m 208\u001B[0m \u001B[38;5;124;03m a callable, it's passed the Match object and must return\u001B[39;00m\n\u001B[0;32m 209\u001B[0m \u001B[38;5;124;03m a replacement string to be used.\"\"\"\u001B[39;00m\n\u001B[1;32m--> 210\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43m_compile\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpattern\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mflags\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msub\u001B[49m\u001B[43m(\u001B[49m\u001B[43mrepl\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstring\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcount\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[1;31mTypeError\u001B[0m: expected string or bytes-like object"
]
}
],
"source": [
"# Character n-gram analyzer used to tokenize names.\n",
"# The default n=3 produces character trigrams.\n",
"\n",
"\n",
"def ngrams(string, n=3):\n",
"    \"\"\"Return all overlapping character n-grams of a cleaned string.\n",
"\n",
"    Cleaning deletes every ',', '-', '.' and '/' character (the ',-.' part of\n",
"    the character class is a range that spans exactly those three characters)\n",
"    and every 'BD' that is preceded by a whitespace character, together with\n",
"    that whitespace character.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    string : str\n",
"        Text to tokenize; re.sub raises TypeError for non-string input.\n",
"    n : int, default 3\n",
"        Length of each n-gram.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        The n-grams in order of appearance; empty when the cleaned string is\n",
"        shorter than ``n`` or when ``n`` is not positive.\n",
"    \"\"\"\n",
"    cleaned = re.sub(r'[,-./]|\\sBD', r'', string)\n",
"    if n <= 0:\n",
"        return []\n",
"    return [cleaned[start:start + n] for start in range(len(cleaned) - n + 1)]\n",
"\n",
"\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [],
"source": [
"# Turn every affiliation name into a TF-IDF vector over character n-grams.\n",
"# scikit-learn's TfidfVectorizer converts a collection of raw documents into a matrix of\n",
"# TF-IDF (term frequency-inverse document frequency) features; passing `ngrams` as the\n",
"# analyzer makes the vectorizer use the n-grams returned by that function as its terms.\n",
"# NOTE(review): `df` appears to hold one row per (record, affiliation) pair, so identical\n",
"# names are vectorized repeatedly - consider de-duplicating the names first; TODO confirm.\n",
"\n",
"\n",
"types = df['Affiliations']\n",
"vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)\n",
"tf_idf_matrix = vectorizer.fit_transform(types)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": "<65153x6417 sparse matrix of type '<class 'numpy.float64'>'\n\twith 1752829 stored elements in Compressed Sparse Row format>"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the repr of the TF-IDF matrix\n",
"tf_idf_matrix"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [],
"source": [
"# Similarity between TF-IDF vectors, computed as a sparse matrix product whose\n",
"# result is returned as a SciPy CSR matrix.\n",
"\n",
"def awesome_cossim_top(A, B, ntop, lower_bound=0):\n",
"    \"\"\"Multiply sparse matrices A and B with ``ct.sparse_dot_topn`` and return a CSR matrix.\n",
"\n",
"    The output buffers are sized for ``ntop`` stored entries per row of A, and the\n",
"    result has shape (rows of A, columns of B).\n",
"    NOTE(review): presumably the kernel keeps only the ``ntop`` largest products per row\n",
"    that exceed ``lower_bound`` - confirm against the sparse_dot_topn documentation.\n",
"    \"\"\"\n",
"    # Work on CSR versions of both operands\n",
"    left = A.tocsr()\n",
"    right = B.tocsr()\n",
"    n_rows, _ = left.shape\n",
"    _, n_cols = right.shape\n",
"\n",
"    index_type = np.int32\n",
"    capacity = n_rows * ntop\n",
"\n",
"    # Pre-allocated CSR buffers that the kernel fills in place\n",
"    out_indptr = np.zeros(n_rows + 1, dtype=index_type)\n",
"    out_indices = np.zeros(capacity, dtype=index_type)\n",
"    out_data = np.zeros(capacity, dtype=left.dtype)\n",
"\n",
"    ct.sparse_dot_topn(\n",
"        n_rows,\n",
"        n_cols,\n",
"        np.asarray(left.indptr, dtype=index_type),\n",
"        np.asarray(left.indices, dtype=index_type),\n",
"        left.data,\n",
"        np.asarray(right.indptr, dtype=index_type),\n",
"        np.asarray(right.indices, dtype=index_type),\n",
"        right.data,\n",
"        ntop,\n",
"        lower_bound,\n",
"        out_indptr,\n",
"        out_indices,\n",
"        out_data,\n",
"    )\n",
"\n",
"    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SELFTIMED: 47.96097183227539\n"
]
}
],
"source": [
"# Run the optimized cosine similarity function.\n",
"# Only stores the top TOP_N_MATCHES most similar items with a similarity above MIN_SIMILARITY.\n",
"\n",
"TOP_N_MATCHES = 10\n",
"MIN_SIMILARITY = 0.8\n",
"\n",
"# perf_counter() is a monotonic clock, so the measured duration cannot be skewed\n",
"# by adjustments of the system clock.\n",
"t1 = time.perf_counter()\n",
"matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), TOP_N_MATCHES, MIN_SIMILARITY)\n",
"t = time.perf_counter() - t1\n",
"print(\"SELFTIMED:\", t)\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [],
"source": [
"# Unpack the sparse similarity matrix into a DataFrame of matching pairs\n",
"\n",
"def get_matches_df(sparse_matrix, name_vector, top=100):\n",
"    \"\"\"Turn a sparse similarity matrix into a table of (left, right, similarity) rows.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    sparse_matrix : SciPy sparse matrix\n",
"        Entry (i, j) is the similarity stored for names i and j.\n",
"    name_vector : sequence or pandas Series\n",
"        Names, looked up by position (not by index label).\n",
"    top : int or None, default 100\n",
"        Maximum number of pairs to return, in the matrix's stored order. A falsy\n",
"        value (None or 0) returns every non-zero pair. Values larger than the\n",
"        number of non-zero pairs are clamped instead of raising IndexError.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Columns ``left_side``, ``right_side`` and ``similarity``.\n",
"    \"\"\"\n",
"    # Take rows, columns and values from one COO view so the three arrays stay\n",
"    # aligned even when the matrix stores explicit zeros.\n",
"    coo = sparse_matrix.tocoo()\n",
"    stored = coo.data != 0\n",
"    rows = coo.row[stored]\n",
"    cols = coo.col[stored]\n",
"    similarity = np.asarray(coo.data[stored], dtype=np.float64)\n",
"\n",
"    if top:\n",
"        # Slicing clamps to the available number of pairs\n",
"        rows = rows[:top]\n",
"        cols = cols[:top]\n",
"        similarity = similarity[:top]\n",
"\n",
"    # Positional lookup works for lists, arrays and Series with any index\n",
"    names = np.asarray(name_vector, dtype=object)\n",
"\n",
"    return pd.DataFrame({'left_side': names[rows],\n",
"                         'right_side': names[cols],\n",
"                         'similarity': similarity})"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [
{
"data": {
"text/plain": " left_side right_side \n3274 LOUISIANA STATE UNIVERSITY LOUISIANA STATE UNIVERSITY SYSTEM \\\n7376 SHAOYANG UNIVERSITY CHAOYANG UNIVERSITY OF TECHNOLOGY \n7377 SHAOYANG UNIVERSITY CHAOYANG UNIVERSITY OF TECHNOLOGY \n7378 SHAOYANG UNIVERSITY CHAOYANG UNIVERSITY OF TECHNOLOGY \n7379 SHAOYANG UNIVERSITY CHAOYANG UNIVERSITY OF TECHNOLOGY \n... ... ... \n599251 CHANGZHOU UNIVERSITY YANGZHOU UNIVERSITY \n599252 CHANGZHOU UNIVERSITY YANGZHOU UNIVERSITY \n599253 CHANGZHOU UNIVERSITY YANGZHOU UNIVERSITY \n599254 CHANGZHOU UNIVERSITY YANGZHOU UNIVERSITY \n599255 CHANGZHOU UNIVERSITY YANGZHOU UNIVERSITY \n\n similarity \n3274 0.883792 \n7376 0.807530 \n7377 0.807530 \n7378 0.807530 \n7379 0.807530 \n... ... \n599251 0.846442 \n599252 0.846442 \n599253 0.846442 \n599254 0.846442 \n599255 0.846442 \n\n[2192 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>left_side</th>\n <th>right_side</th>\n <th>similarity</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>3274</th>\n <td>LOUISIANA STATE UNIVERSITY</td>\n <td>LOUISIANA STATE UNIVERSITY SYSTEM</td>\n <td>0.883792</td>\n </tr>\n <tr>\n <th>7376</th>\n <td>SHAOYANG UNIVERSITY</td>\n <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n <td>0.807530</td>\n </tr>\n <tr>\n <th>7377</th>\n <td>SHAOYANG UNIVERSITY</td>\n <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n <td>0.807530</td>\n </tr>\n <tr>\n <th>7378</th>\n <td>SHAOYANG UNIVERSITY</td>\n <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n <td>0.807530</td>\n </tr>\n <tr>\n <th>7379</th>\n <td>SHAOYANG UNIVERSITY</td>\n <td>CHAOYANG UNIVERSITY OF TECHNOLOGY</td>\n <td>0.807530</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>599251</th>\n <td>CHANGZHOU UNIVERSITY</td>\n <td>YANGZHOU UNIVERSITY</td>\n <td>0.846442</td>\n </tr>\n <tr>\n <th>599252</th>\n <td>CHANGZHOU UNIVERSITY</td>\n <td>YANGZHOU UNIVERSITY</td>\n <td>0.846442</td>\n </tr>\n <tr>\n <th>599253</th>\n <td>CHANGZHOU UNIVERSITY</td>\n <td>YANGZHOU UNIVERSITY</td>\n <td>0.846442</td>\n </tr>\n <tr>\n <th>599254</th>\n <td>CHANGZHOU UNIVERSITY</td>\n <td>YANGZHOU UNIVERSITY</td>\n <td>0.846442</td>\n </tr>\n <tr>\n <th>599255</th>\n <td>CHANGZHOU UNIVERSITY</td>\n <td>YANGZHOU UNIVERSITY</td>\n <td>0.846442</td>\n </tr>\n </tbody>\n</table>\n<p>2192 rows × 3 columns</p>\n</div>"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Store the matches in a DataFrame called matches_df and display it.\n",
"# top=None returns every stored pair, so no hard-coded pair count can go stale\n",
"# when the input data changes.\n",
"\n",
"matches_df = get_matches_df(matches, types, top=None)\n",
"matches_df = matches_df[matches_df['similarity'] < 0.99999]  # For removing all exact matches\n",
"matches_df\n",
"\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [
{
"data": {
"text/plain": " left_side \n0 NATURAL HISTORY MUSEUM LONDON \\\n1 NATURAL HISTORY MUSEUM LONDON \n2 NATURAL HISTORY MUSEUM LONDON \n3 NATURAL HISTORY MUSEUM LONDON \n4 BULGARIAN ACADEMY OF SCIENCES \n.. ... \n195 DELFT UNIVERSITY OF TECHNOLOGY \n196 DELFT UNIVERSITY OF TECHNOLOGY \n197 NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA \n198 NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA \n199 NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA \n\n right_side similarity \n0 NATURAL HISTORY MUSEUM LONDON 1.0 \n1 NATURAL HISTORY MUSEUM LONDON 1.0 \n2 NATURAL HISTORY MUSEUM LONDON 1.0 \n3 NATURAL HISTORY MUSEUM LONDON 1.0 \n4 BULGARIAN ACADEMY OF SCIENCES 1.0 \n.. ... ... \n195 DELFT UNIVERSITY OF TECHNOLOGY 1.0 \n196 DELFT UNIVERSITY OF TECHNOLOGY 1.0 \n197 NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA 1.0 \n198 NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA 1.0 \n199 NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA 1.0 \n\n[200 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>left_side</th>\n <th>right_side</th>\n <th>similarity</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>NATURAL HISTORY MUSEUM LONDON</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>BULGARIAN ACADEMY OF SCIENCES</td>\n <td>BULGARIAN ACADEMY OF SCIENCES</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>195</th>\n <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>196</th>\n <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n <td>DELFT UNIVERSITY OF TECHNOLOGY</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>197</th>\n <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>198</th>\n <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>199</th>\n <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n <td>NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>200 rows × 3 columns</p>\n</div>"
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}