You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_nlp_demo.ipynb

202 lines
29 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import shutil\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import spacy\n",
"from flashgeotext.geotext import GeoText\n",
"from sklearn.manifold import TSNE\n",
"\n",
"\n",
"# Name of the spaCy pipeline to load; the alternative handled further down is 'en_core_web_trf'.\n",
"nlp_version = 'en_core_web_lg'\n",
"nlp = spacy.load(nlp_version)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"# Directory the wos_*.xlsx input spreadsheets are read from.\n",
"outdir = \"wos_processed_data\"\n",
"# Column used as the record key for grouping and merging.\n",
"record_col = \"UT (Unique WOS ID)\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"# Keyword table, restricted to rows that actually carry a keyword.\n",
"kw_df = pd.read_excel(f\"{outdir}/wos_keywords.xlsx\").dropna(subset=[\"keyword_all\"])\n",
"\n",
"# Processed record table.\n",
"wos = pd.read_excel(f\"{outdir}/wos_processed.xlsx\")\n",
"\n",
"# Collapse the keywords of each record into a single '; '-separated string.\n",
"wos_kwd_concat = (\n",
"    kw_df\n",
"    .groupby(record_col, as_index=False)\n",
"    .agg({'keyword_all': '; '.join})\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# Keyword pseudo-documents: one row per distinct keyword, with an id built from its original row index.\n",
"kwd_nlp = pd.DataFrame(kw_df[\"keyword_all\"].drop_duplicates()).rename(columns={\"keyword_all\": \"Document\"})\n",
"kwd_nlp[\"Type\"] = \"kw\"\n",
"kwd_nlp[record_col] = \"kw_\" + kwd_nlp.index.astype(str)\n",
"\n",
"# Record documents. The merge is an inner join, so records without any keyword row are left out.\n",
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
"# Upper-cased 'title - abstract - keywords'; missing abstracts / keyword strings are replaced by ''.\n",
"wos_nlp[\"Document\"] = (\n",
"    wos_nlp[\"Article Title\"]\n",
"    .str.cat(wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ')\n",
"    .str.upper()\n",
")\n",
"wos_nlp[\"Type\"] = \"doc\"\n",
"# Fix: this de-duplication used to be computed and then discarded. Keeping the result means a repeated\n",
"# (record, document) row is embedded only once and does not multiply in later merges on the record id.\n",
"wos_nlp = wos_nlp.drop_duplicates(subset=[record_col, \"Document\"])\n",
"\n",
"# Stack both document kinds; keyword rows have no keyword_all column of their own, so it is NaN there.\n",
"tnse_nlp = pd.concat([kwd_nlp, wos_nlp], ignore_index=True)\n",
"tnse_nlp = tnse_nlp[[record_col, \"Type\", \"Document\", \"keyword_all\"]]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"vectors = []\n",
"vector_norms = []\n",
"\n",
"# Run every document through the spaCy pipeline in batches spread over worker processes.\n",
"documents = tnse_nlp['Document'].astype(str).tolist()\n",
"for doc in nlp.pipe(documents, batch_size=300, n_process=4):\n",
"    if nlp_version == 'en_core_web_trf':\n",
"        # Average the last transformer tensor over its first axis (computed once and reused).\n",
"        # NOTE(review): assumes tensors[-1] is a per-document 2-D array - confirm for the installed spaCy.\n",
"        trf_vector = doc._.trf_data.tensors[-1].mean(axis=0)\n",
"        trf_norm = np.linalg.norm(trf_vector)\n",
"        # Store the unit-length vector; skip the division when the norm is 0 to avoid NaN/inf entries.\n",
"        vectors.append(trf_vector / trf_norm if trf_norm > 0 else trf_vector)\n",
"        # Fix: record the norm *before* scaling. The norm after scaling is always 1, which made the\n",
"        # histogram below uninformative and inconsistent with the doc.vector_norm used in the other branch.\n",
"        vector_norms.append(trf_norm)\n",
"    else:\n",
"        vectors.append(doc.vector)\n",
"        vector_norms.append(doc.vector_norm)\n",
"\n",
"\n",
"tnse_nlp['vector'] = vectors\n",
"tnse_nlp['vector_norm'] = vector_norms\n",
"tnse_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"KeyboardInterrupt\n",
"\n"
]
}
],
"source": [
"# Build the feature matrix straight from the stored vectors. Slicing `.values` of a frame that mixes the\n",
"# string id column with the float columns produced an object-dtype array instead of a numeric one.\n",
"labels = tnse_nlp[record_col].to_numpy()\n",
"record_vectors = np.vstack(tnse_nlp[\"vector\"].to_list())\n",
"\n",
"# Project the embeddings to 2-D with a fixed seed so reruns give the same layout.\n",
"# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version.\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"\n",
"# One row per embedded document: its id followed by the two t-SNE coordinates.\n",
"tnse_data = pd.DataFrame(tnse_2d, columns=[\"TNSE-X\", \"TNSE-Y\"])\n",
"tnse_data.insert(0, record_col, labels)\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Attach the 2-D coordinates to the document table via the record id.\n",
"wos_plot = tnse_nlp.merge(tnse_data, on=record_col)\n",
"\n",
"# Pass the frame by keyword: seaborn versions whose plotting arguments are keyword-only reject a positional frame.\n",
"ax = sns.scatterplot(data=wos_plot, x=\"TNSE-X\", y=\"TNSE-Y\", hue='Type', s=1)\n",
"ax.set_title(\"t-SNE projection of keyword and document embeddings\")\n",
"# Put the legend outside the axes so it does not cover the points; the trailing ';' hides the Legend repr.\n",
"ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}