You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_nlp_demo.ipynb

202 lines
29 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import inspect\n",
"import os\n",
"import re\n",
"import shutil\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import spacy\n",
"from flashgeotext.geotext import GeoText\n",
"from sklearn.manifold import TSNE\n",
"\n",
"\n",
"# Name of the spaCy pipeline to load; 'en_core_web_trf' is the alternative handled further down.\n",
"nlp_version = \"en_core_web_lg\"\n",
"nlp = spacy.load(nlp_version)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"# Folder the Excel inputs are read from.\n",
"outdir = 'wos_processed_data'\n",
"# Column used as the record key for grouping and merging.\n",
"record_col = 'UT (Unique WOS ID)'"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"# Read the keyword table and the processed records from `outdir`.\n",
"kw_df = pd.read_excel(f\"{outdir}/wos_keywords.xlsx\")\n",
"wos = pd.read_excel(f\"{outdir}/wos_processed.xlsx\")\n",
"\n",
"# Discard rows without a keyword.\n",
"kw_df = kw_df.dropna(subset=[\"keyword_all\"]).copy()\n",
"\n",
"# One row per record, its keywords joined into a single '; '-separated string.\n",
"wos_kwd_concat = (\n",
"    kw_df\n",
"    .groupby(record_col, as_index=False)\n",
"    .agg({\"keyword_all\": \"; \".join})\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# Every distinct keyword becomes its own document so it can be embedded alongside the records.\n",
"kwd_nlp = pd.DataFrame(kw_df[\"keyword_all\"].drop_duplicates())\n",
"kwd_nlp = kwd_nlp.rename(columns={\"keyword_all\":\"Document\"})\n",
"kwd_nlp[\"Type\"] = \"kw\"\n",
"# Synthetic key for keyword rows, derived from the row's index label in kw_df.\n",
"kwd_nlp[record_col] = \"kw_\"+(kwd_nlp.index).astype(str)\n",
"\n",
"# Default (inner) merge: only records that have an aggregated keyword string are kept.\n",
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
"# Alternative document definitions (keywords only / title + abstract):\n",
"# wos_nlp[\"Document\"] = wos_nlp[\"keyword_all\"].fillna(\"\").str.upper()\n",
"# wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(wos_nlp[[\"Abstract\"]].fillna(\"\"), sep=' - ').str.upper()\n",
"# Document = title - abstract - keywords, upper-cased. A missing title is treated as empty;\n",
"# otherwise str.cat would return NaN for the whole document.\n",
"wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].fillna(\"\").str.cat(wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ').str.upper()\n",
"wos_nlp[\"Type\"] = \"doc\"\n",
"# Remove repeated records. drop_duplicates() returns a new frame, so the result has to be\n",
"# assigned back; repeated keys would otherwise multiply rows when merging on record_col later.\n",
"wos_nlp = wos_nlp.drop_duplicates(subset=[record_col, \"Document\"])\n",
"\n",
"# Stack keyword rows and record rows; keyword rows have no keyword_all value (NaN).\n",
"tnse_nlp = pd.concat([kwd_nlp,wos_nlp], ignore_index=True)\n",
"tnse_nlp = tnse_nlp[[record_col,\"Type\",\"Document\",\"keyword_all\"]]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGdCAYAAAAPLEfqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABFwElEQVR4nO3de1zUdaL/8dcgAaPEitxWxGOtrXkJB4TQNt3Uh5W30h9edq2TuVp6TKR+u2khruL9F2oXw1J385a2mpf0qK22dvFYlu2iDJqHDmabrICCYmggCHx/fxhzmpT8guAMzPv5ePDI+Xzm+53Pe/gC7+b7ZbAYhmEgIiIiIj/Jy9ULEBEREWkMVJpERERETFBpEhERETFBpUlERETEBJUmERERERNUmkRERERMUGkSERERMUGlSURERMQElSYRERERE1SaREREREzwdvUCGrOzZy9Q33+ExmKBoKBbG2Tf7kQ5mx5PyeopOcFzsipn01NT1urxulJpugGGQYMdeA25b3einE2Pp2T1lJzgOVmVs+mp76w6PSciIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigrerFyBNg5eXBS8vS623a9bMdb29qsqgqspD/tS3iIjcMJUmuWFeXhZ+1rI53nUoQIGBLRpgReZUVFbx7fkSFScRETFFpUlumJeXBe9mXjy94TDHz1x09XJMuSPUn1d+G42Xl0WlSURETFFpknpz/MxFvsgtdvUyREREGoQuBBcRERExQaVJRERExASVJhERERETVJpERERETFBpEhERETFBpUlERETEBJeWpry8PCZMmEC3bt3o27cvq1evdswdO3aMESNGYLPZGDZsGEePHnXadufOnfTr1w+bzcakSZM4d+6cY84wDBYtWkSPHj2Ii4sjNTWVqqoqx3xRURGTJ08mOjqavn37sn379gbPKiIiIo2bS0vTM888Q/Pmzdm6dSvTpk3j5Zdf5m9/+xslJSWMHz+e2NhYtm7dSnR0NBMmTKCkpASAzMxMkpOTSUhIYOPGjRQXF5OUlOTY76pVq9i5cydpaWksWbKEHTt2sGrVKsd8UlISFy5cYOPGjUycOJHp06eTmZl50/OLiIhI4+GyN7f89ttvycjIYM6cOdx2223cdttt9OrVi08//ZRvv/0WX19fpk6disViITk5mf/6r/9i9+7dxMfHs27dOgYMGMDQoUMBSE1NpU+fPuTk5NC2bVvWrl1LYmIisbGxADz77LO88sorjBs3jpMnT/Lhhx/y/vvvExERQYcOHcjIyOCtt96ia9eurno6RERExM257JUmPz8/rFYrW7du5fLly5w4cYJDhw7RqVMn7HY7MTExWCxX/gCsxWKhW7duZGRkAGC32x2FCKB169aEh4djt9s5ffo0eXl53H333Y75mJgYTp06xZkzZ7Db7bRu3ZqIiAin+cOHD9+c4CIiItIoueyVJl9fX2bMmMGcOXNYu3YtlZWVxMfHM2LECN5//33uuOMOp/sHBQWRnZ0NwJkzZwgNDb1qPj8/n4KCAgCn+eDgYADH/LW2PX36dK0zfN/p6lX1Phti33JtDflce9Ln01OyekpO8Jysytn01JT1RrO79G/PffXVV/Tp04ff/e53ZGdnM2fOHO655x5KS0vx8fFxuq+Pjw/l5eUAXLp0qcb5S5cuOW7/cA6gvLz8uvuujaCgW2u9jTvsW/5XYGCLm/I4nvT59JSsnpITPCercjY99Z3VZaXp008/ZfPmzezbtw8/Pz8iIyM5ffo0r7/+Om3btr2qxJSXl+Pn5wdceZXqWvNWq9WpIPn6+jr+DWC1WmvctnrftXH27AUMo9ab/SSL5conuSH23VCaNfO6aeWjvhUVfUdlZdX171hHjfHzWVeektVTcoLnZF
XOpqemrNXjdeWy0nT06FHatWvnVFY6d+7MsmXLiI2NpbCw0On+hYWFjtNqYWFh15wPCQkhLCwMgIKCAsd1S9Wn7Krna9q2tgyDBjvwGnLf4uxmPM+e9Pn0lKyekhM8J6tyNj31ndVlF4KHhobyzTffOL3qc+LECSIiIrDZbBw+fBjj+6SGYXDo0CFsNhsANpuN9PR0x3Z5eXnk5eVhs9kICwsjPDzcaT49PZ3w8HBCQ0OJiori1KlT5OfnO81HRUU1cGIRERFpzFxWmvr27cstt9zC9OnT+frrr/nggw9YtmwZjz32GP3796e4uJh58+Zx/Phx5s2bR2lpKQMGDABg1KhRbN++nU2bNpGVlcXUqVPp3bs3bdu2dcwvWrSIgwcPcvDgQRYvXszo0aMBaNu2LT179mTKlClkZWWxadMmdu7cyaOPPuqqp0JEREQaAZednrv11ltZvXo18+bNY/jw4bRq1YqJEyfym9/8BovFwvLly5k5cyZvv/02d955JytWrKB58+YAREdHM3v2bJYsWcK3337Lvffey5w5cxz7HjduHGfPniUhIYFmzZoxfPhwxowZ45hPTU0lOTmZkSNHEhISwvz58/UeTSIiIvKTLIbhKWc2619hYcNcCB4cfGuD7LuheHtfuRB80JL9fJFb7OrlmNIlPIBdib0oKvqOioqGvRC8sX0+68pTsnpKTvCcrMrZ9NSUtXq8rlz6lgNSs2bNGs/fUm5MaxUREakrlSY34+VlobLKaLS/wi8iItJUqTS5GYvFQjMvC09vOMzxMxddvRxTet8ZwpQHO7p6GSIiIg1KpclNHT9zsdFcH9Q+RK+KiYhI06eLUURERERMUGkSERERMUGlSURERMQElSYRERERE1SaRERERExQaRIRERExQaVJRERExASVJhERERETVJpERERETFBpEhERETFBpUlERETEBJUmERERERNUmkRERERMUGkSERERMUGlSURERMQElSYRERERE1SaRERERExQaRIRERExQaVJRERExASVJhERERETVJpERERETFBpEhERETFBpUlERETEBJUmERERERNUmkRERERMUGkSERERMcFlpWnr1q3ceeedV3107NgRgGPHjjFixAhsNhvDhg3j6NGjTtvv3LmTfv36YbPZmDRpEufOnXPMGYbBokWL6NGjB3FxcaSmplJVVeWYLyoqYvLkyURHR9O3b1+2b99+c0KLiIhIo+Wy0jRw4EA+/vhjx8dHH31Eu3btGD16NCUlJYwfP57Y2Fi2bt1KdHQ0EyZMoKSkBIDMzEySk5NJSEhg48aNFBcXk5SU5Nj3qlWr2LlzJ2lpaSxZsoQdO3awatUqx3xSUhIXLlxg48aNTJw4kenTp5OZmXnTnwMRERFpPFxWmvz8/AgJCXF8/Od//ieGYfDss8/y7rvv4uvry9SpU2nfvj3Jycm0aNGC3bt3A7Bu3ToGDBjA0KFD6dixI6mpqezbt4+cnBwA1q5dS2JiIrGxsfTo0YNnn32W9evXA3Dy5Ek+/PBD5s6dS4cOHRgxYgQPP/wwb731lqueChEREWkE3OKapvPnz/OnP/2JP/zhD/j4+GC324mJicFisQBgsVjo1q0bGRkZANjtdmJjYx3bt27dmvDwcOx2O6dPnyYvL4+7777bMR8TE8OpU6c4c+YMdrud1q1bExER4TR/+PDhmxNWREREGiVvVy8A4C9/+QuhoaH0798fgIKCAu644w6n+wQFBZGdnQ3AmTNnCA0NvWo+Pz+fgoICAKf54OBgAMf8tbY9ffp0rdf9faerVw2xT/lpDfmcV+/bEz6vnpLVU3KC52RVzqanpqw3mt3lpckwDDZt2sQTTzzhGCstLcXHx8fpfj4+PpSXlwNw6dKlGucvXbrkuP3DOYDy8vLr7rs2goJurfU24l4CA1vclMfxpGPFU7J6Sk7wnKzK2fTUd1aXl6YjR45w+vRpBg0a5Bjz9fW9qsSUl5fj5+f3k/NWq9
WpIPn6+jr+DWC1Wq+779o4e/YChlHrzX6St7cXLVvenB/kAkVF31FZWXX9O9aRxXLli7YhjhV34ylZPSUneE5W5Wx6aspaPV5XLi9N+/fvJzY2lp/97GeOsbCwMAoLC53uV1hY6DitVtN8SEgIYWFhwJVTfNXXLVWfsquer2nb2jIM6v3Aa+oHsju6Gc95Qxwr7spTsnpKTvCcrMrZ9NR3VpdfCJ6ZmUm3bt2cxmw2G4cPH8b4PqlhGBw6dAibzeaYT09Pd9w/Ly+PvLw8bDYbYWFhhIeHO82np6cTHh5OaGgoUVFRnDp1ivz8fKf5qKioBkwpIiIijZ3LS1N2dvZVF33379+f4uJi5s2bx/Hjx5k3bx6lpaUMGDAAgFGjRrF9+3Y2bdpEVlYWU6dOpXfv3rRt29Yxv2jRIg4ePMjBgwdZvHgxo0ePBqBt27b07NmTKVOmkJWVxaZNm9i5cyePPvrozQ0uIiIijYrLT88VFhYSEBDgNObv78/y5cuZOXMmb7/9NnfeeScrVqygefPmAERHRzN79myWLFnCt99+y7333sucOXMc248bN46zZ8+SkJBAs2bNGD58OGPGjHHMp6amkpyczMiRIwkJCWH+/Pl07dr1puQVERGRxsnlpammd+Lu2rUr77zzTo3bxcfHEx8ff825Zs2akZSU5PQu4T8UFBTEsmXLar9YERER8VguPz0nIiIi0hioNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJjg0tJUXl7OrFmzuPvuu/nVr37Fiy++iGEYABw7dowRI0Zgs9kYNmwYR48eddp2586d9OvXD5vNxqRJkzh37pxjzjAMFi1aRI8ePYiLiyM1NZWqqirHfFFREZMnTyY6Opq+ffuyffv2mxNYREREGi2Xlqa5c+dy4MAB3njjDRYvXszbb7/Nxo0bKSkpYfz48cTGxrJ161aio6OZMGECJSUlAGRmZpKcnExCQgIbN26kuLiYpKQkx35XrVrFzp07SUtLY8mSJezYsYNVq1Y55pOSkrhw4QIbN25k4sSJTJ8+nczMzJueX0RERBoPb1c98Pnz59myZQurVq2ia9euAIwdOxa73Y63tze+vr5MnToVi8VCcnIy//Vf/8Xu3buJj49n3bp1DBgwgKFDhwKQmppKnz59yMnJoW3btqxdu5bExERiY2MBePbZZ3nllVcYN24cJ0+e5MMPP+T9998nIiKCDh06kJGRwVtvveVYh4iIiMiPueyVpvT0dPz9/YmLi3OMjR8/ngULFmC324mJicFisQBgsVjo1q0bGRkZANjtdkchAmjdujXh4eHY7XZOnz5NXl4ed999t2M+JiaGU6dOcebMGex2O61btyYiIsJp/vDhww2cWERERBozl73SlJOTQ5s2bdi2bRvLli3j8uXLxMfHM3HiRAoKCrjjjjuc7h8UFER2djYAZ86cITQ09Kr5/Px8CgoKAJzmg4ODARzz19r29OnTtc7wfaerVw2xT/lpDfmcV+/bEz6vnpLVU3KC52RVzqanpqw3mt1lpamkpIRvvvmGDRs2sGDBAgoKCpgxYwZWq5XS0lJ8fHyc7u/j40N5eTkAly5dqnH+0qVLjts/nIMrF55fb9+1ERR0a623EfcSGNjipjyOJx0rnpLVU3KC52RVzqanvrO6rDR5e3tz8eJFFi9eTJs2bQDIzc3lL3/5C+3atbuqxJSXl+Pn5weAr6/vNeetVqtTQfL19XX8G8Bqtda4bfW+a+Ps2Q
t8/8t+9cbb24uWLW/OD3KBoqLvqKysuv4d68hiufJF2xDHirvxlKyekhM8J6tyNj01Za0eryuXlaaQkBB8fX0dhQng9ttvJy8vj7i4OAoLC53uX1hY6DitFhYWds35kJAQwsLCACgoKHBct1R9yq56vqZta8swqPcDr6kfyO7oZjznDXGsuCtPyeopOcFzsipn01PfWV12IbjNZqOsrIyvv/7aMXbixAnatGmDzWbj8OHDjvdsMgyDQ4cOYbPZHNump6c7tsvLyyMvLw+bzUZYWBjh4eFO8+np6YSHhxMaGkpUVBSnTp0iPz/faT4qKqqBE4uIiEhj5rLS9Itf/ILevXuTlJREVlYW+/fvZ8WKFYwaNYr+/ftTXFzMvHnzOH78OPPmzaO0tJQBAwYAMGrUKLZv386mTZvIyspi6tSp9O7dm7Zt2zrmFy1axMGDBzl48CCLFy9m9OjRALRt25aePXsyZcoUsrKy2LRpEzt37uTRRx911VMhIiIijYDLTs8BLFq0iDlz5jBq1CisViuPPvoojz32GBaLheXLlzNz5kzefvtt7rzzTlasWEHz5s0BiI6OZvbs2SxZsoRvv/2We++9lzlz5jj2O27cOM6ePUtCQgLNmjVj+PDhjBkzxjGfmppKcnIyI0eOJCQkhPnz5+s9mkREROQnubQ03XrrraSmpl5zrmvXrrzzzjs1bhsfH098fPw155o1a0ZSUpLTu4T/UFBQEMuWLav9gkVERMRj6Q/2ioiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICS4tTX/729+48847nT4SExMBOHbsGCNGjMBmszFs2DCOHj3qtO3OnTvp168fNpuNSZMmce7cOcecYRgsWrSIHj16EBcXR2pqKlVVVY75oqIiJk+eTHR0NH379mX79u03J7CIiIg0Wi4tTcePH6dPnz58/PHHjo+5c+dSUlLC+PHjiY2NZevWrURHRzNhwgRKSkoAyMzMJDk5mYSEBDZu3EhxcTFJSUmO/a5atYqdO3eSlpbGkiVL2LFjB6tWrXLMJyUlceHCBTZu3MjEiROZPn06mZmZNz2/iIiINB51Kk2fffYZhmHc8IN/9dVXdOjQgZCQEMdHQEAA7777Lr6+vkydOpX27duTnJxMixYt2L17NwDr1q1jwIABDB06lI4dO5Kamsq+ffvIyckBYO3atSQmJhIbG0uPHj149tlnWb9+PQAnT57kww8/ZO7cuXTo0IERI0bw8MMP89Zbb91wHhEREWm66lSann76aXr16sXcuXPJyMio84N/9dVX3HbbbVeN2+12YmJisFgsAFgsFrp16+Z4LLvdTmxsrOP+rVu3Jjw8HLvdzunTp8nLy+Puu+92zMfExHDq1CnOnDmD3W6ndevWREREOM0fPny4zjlERESk6fOuy0affPIJn3zyCbt372b8+PH4+/szYMAABg0aROfOnU3twzAMvv76az7++GOWL19OZWUl/fv3JzExkYKCAu644w6n+wcFBZGdnQ3AmTNnCA0NvWo+Pz+fgoICAKf54OBgAMf8tbY9ffp07Z4E4PtOV68aYp/y0xryOa/etyd8Xj0lq6fkBM/JqpxNT01ZbzR7nUqTt7c39913H/fddx8VFRUcOHCADz74gEceeYSwsDAeeugh4uPjCQ8Pr3Efubm5lJaW4uPjw8svv8y//vUv5s6dy6VLlxzjP+Tj40N5eTkAly5dqnH+0qVLjts/nAMoLy
+/7r5rIyjo1lpvI+4lMLDFTXkcTzpWPCWrp+QEz8mqnE1PfWetU2mqVl5ezv79+3nvvff46KOPCAwMpG/fvvzzn/9k0KBB/OEPf+Df//3fr7ltmzZtOHjwID/72c+wWCx06tSJqqoqpkyZQlxc3FUlpry8HD8/PwB8fX2vOW+1Wp0Kkq+vr+PfAFartcZtq/ddG2fPXqAeLu1y4u3tRcuWN+cHuUBR0XdUVlZd/451ZLFc+aJtiGPF3XhKVk/JCZ6TVTmbnpqyVo/XVZ1K0969e9m9ezcfffQRt9xyCw8++CBLly51us5o/fr1vPjiizWWJoCWLVs63W7fvj1lZWWEhIRQWFjoNFdYWOg4rRYWFnbN+ZCQEMLCwgAoKChwXLdUfcquer6mbWvLMKj3A6+pH8ju6GY85w1xrLgrT8nqKTnBc7IqZ9NT31nrdCH4c889R7NmzXjxxRf5+OOPSUlJcSpMAHfddRe/+93vatzH/v376d69O6WlpY6x//7v/6Zly5aOC7Orf0PPMAwOHTqEzWYDwGazkZ6e7tguLy+PvLw8bDYbYWFhhIeHO82np6cTHh5OaGgoUVFRnDp1ivz8fKf5qKioujwVIiIi4iHq9ErTgQMHuHjxIsXFxTRr1gyAd999l7vvvtvxio3NZnOUnGuJjo7G19eX6dOnM2nSJHJyckhNTeWJJ56gf//+LF68mHnz5vHb3/6WDRs2UFpayoABAwAYNWoUjz32GFFRUURGRjJv3jx69+5N27ZtHfOLFi3i5z//OQCLFy9m7NixALRt25aePXsyZcoUkpOTOXLkCDt37mTdunV1eSpERETEQ9TplaZDhw5x//33s2PHDsfY2rVrGThwoNMrPD/F39+fN954g3PnzjFs2DCSk5P5zW9+wxNPPIG/vz/Lly8nPT2d+Ph47HY7K1asoHnz5sCVwjV79myWLl3KqFGj+NnPfsaCBQsc+x43bhwDBw4kISGBp59+miFDhjBmzBjHfGpqKi1atGDkyJEsW7aM+fPn07Vr17o8FSIiIuIhLEYd3qVy6NChDBw4kPHjxzuNL1++nPfee48tW7bU2wLdWWFhw1wIHhjYgkFL9vNFbnH97ryBPGxrzZJR3RrVmruEB7ArsRdFRd9RUdGwF4IHB9/aIMeKu/GUrJ6SEzwnq3I2PTVlrR6vqzq90vTPf/6T/v37XzU+YMAAjh8/XufFiIiIiLirOpWmX/ziF/z1r3+9avyDDz7g3/7t3254USIiIiLupk4Xgj/zzDM89dRTfPLJJ3Tp0gWAL7/8kn/84x+8+uqr9bpAEREREXdQp1eafv3rX/POO+/QuXNnTpw4wcmTJ+nYsSO7du3ivvvuq+81ioiIiLhcnd8R/Je//CXPP/98fa5FRERExG3VqTQVFxezcuVKjhw5QkVFBT/+Bby1a9fWy+JERERE3EWdStPUqVM5cuQIDz30EP7+/vW9JhERERG3U+d3BF+3bp3eEFJEREQ8Rp0uBA8LC8PLq06bioiIiDRKdT49l5KSQmJiIu3ateOWW25xmg8PD6+XxYmIiIi4izqVpsmTJwM4/oyKxWIBwDAMLBYL//3f/11PyxMRERFxD3UqTe+//359r0NERETErdXpwqQ2bdrQpk0bSkpKOHbsGIGBgVRVVREeHk6bNm3qe40iIiIiLlenV5q+/fZbnn76aT7//HMA9uzZw7x588jJyWHFihUqTiIiItLk1OmVprlz52K1Wvnss8/w9fUFYP78+fz85z9n7ty59bpAEREREXdQp9K0f/9+fv/73xMQEOAYa9WqFUlJSfz973+vt8WJiIiIuIs6v9lSWVnZVWPnzp3D27vOf85ORERExG3VqTQNHjyYefPmkZ2djcVioaSkhM8++4w//vGPDBw4sL7XKCIiIuJydX5zyxdffJH4+HguX77MkCFDaNasGSNGjGDq1Kn1vUYRERERl6tTafLx8eH555/nmWeeIScnh8rKStq2bUuLFi
3qe30iIiIibqFOpelaF3sfO3bM8e+777677isSERERcUN1Kk2PPfbYNcd9fHwICQnRO4aLiIhIk1On0pSVleV0u7KykpMnTzJnzhweeuihelmYiIiIiDup81sO/FCzZs24/fbbef7553nllVfqY5ciIiIibqVeSlO1s2fPUlxcXJ+7FBEREXELdTo9l5SUdNXYd999x4EDB+jfv/8NL0pERETE3dTb23e3bNmS5557jiFDhtTXLkVERETcRp1K04IFC+p7HSIiIiJurU6lKS0tzfR9ExIS6vIQIiIiIm6lTqXpm2++Yffu3bRs2ZK77roLHx8fsrKyOHnyJFFRUY4/2muxWOp1sSIiIiKuUqffnvPx8eGhhx7igw8+4PXXX+eVV15hz549jBkzhttvv50333yTN998k7Vr15re5/jx43n++ecdt48dO8aIESOw2WwMGzaMo0ePOt1/586d9OvXD5vNxqRJkzh37pxjzjAMFi1aRI8ePYiLiyM1NZWqqirHfFFREZMnTyY6Opq+ffuyffv2ujwNIiIi4kHqVJreffddnnjiCW655Ran8ZEjR/Luu+/Wen+7du1i3759jtslJSWMHz+e2NhYtm7dSnR0NBMmTKCkpASAzMxMkpOTSUhIYOPGjRQXFzv9Rt+qVavYuXMnaWlpLFmyhB07drBq1SrHfFJSEhcuXGDjxo1MnDiR6dOnk5mZWet1i4iIiOeoU2kKCwtj//79V43v2bOHtm3b1mpf58+fJzU1lcjISMfYu+++i6+vL1OnTqV9+/YkJyfTokULdu/eDcC6desYMGAAQ4cOpWPHjqSmprJv3z5ycnIAWLt2LYmJicTGxtKjRw+effZZ1q9fD8DJkyf58MMPmTt3Lh06dGDEiBE8/PDDvPXWW3V5KkRERMRD1Omapj/84Q8888wzfPTRR3Ts2BGAI0eOcOzYMZYtW1arfb3wwgsMGTKEM2fOOMbsdjsxMTGOa6IsFgvdunUjIyOD+Ph47HY7Tz75pOP+rVu3Jjw8HLvdjo+PD3l5eU5/NDgmJoZTp05x5swZ7HY7rVu3JiIiwml++fLldXkqRERExEPUqTTdf//9bN26la1bt/LVV1/h5+dHXFwcL730EiEhIab38+mnn/KPf/yDHTt2kJKS4hgvKCjgjjvucLpvUFAQ2dnZAJw5c4bQ0NCr5vPz8ykoKABwmg8ODgZwzF9r29OnT5ted7WGuM5d187ffA35nFfv2xM+r56S1VNygudkVc6mp6asN5q9zm9ueeedd5KUlMS3336Lv78/Xl5etfptubKyMmbOnMmMGTPw8/NzmistLcXHx8dpzMfHh/LycgAuXbpU4/ylS5cct384B1BeXn7dfddGUNCttd5G3EtgYIub8jiedKx4SlZPyQmek1U5m576zlqn0mQYBsuWLWP16tVcuHCBPXv28Morr9C8eXOmT59+VSm5lrS0NO666y569ep11Zyvr+9VJaa8vNxRrmqat1qtTgXJ19fX8W8Aq9V63X3XxtmzFzCMWm/2k7y9vWjZ8ub8IBcoKvqOysqq69+xjiyWK1+0DXGsuBtPyeopOcFzsipn01NT1urxuqpTaVq6dCm7du3i//2//8f//b//F4D/83/+DzNmzCA1NZXp06dfdx+7du2isLCQ6Oho4H+LzZ49exg8eDCFhYVO9y8sLHScVgsLC7vmfEhICGFhYcCVU3zV1y1Vn7Krnq9p29oyDOr9wGvqB7I7uhnPeUMcK+7KU7J6Sk7wnKzK2fTUd9Y6/fbcO++8w+zZs+nTp4/jlNy9997LCy+8wF//+ldT+3jzzTfZsWMH27ZtY9u2bfTt25e+ffuybds2bDYbhw8fxvg+qWEYHDp0CJvNBoDNZiM9Pd2xr7y8PPLy8rDZbISFhREeHu40n56eTnh4OKGhoURFRXHq1Cny8/Od5qOiouryVIiIiIiHqNMrTWfPnr3qYmqAgIAAx3spXU+bNm2cbrdoceWUVL
t27QgKCmLx4sXMmzeP3/72t2zYsIHS0lIGDBgAwKhRo3jssceIiooiMjKSefPm0bt3b8fbHYwaNYpFixbx85//HIDFixczduxYANq2bUvPnj2ZMmUKycnJHDlyhJ07d7Ju3bq6PBUiIiLiIer0SlOPHj144403nMYuXrzIiy++SPfu3W94Uf7+/ixfvpz09HTHWwysWLGC5s2bAxAdHc3s2bNZunQpo0aN4mc/+5nTHxEeN24cAwcOJCEhgaeffpohQ4YwZswYx3xqaiotWrRg5MiRLFu2jPnz59O1a9cbXreIiIg0XRbDqP3Zvvz8fBISEsjLy6OoqIj27duTm5tLeHg4r7/+utN7IDVlhYUNcyF4YGALBi3Zzxe5xfW78wbysK01S0Z1a1Rr7hIewK7EXhQVfUdFRcNeCB4cfGuDHCvuxlOyekpO8Jysytn01JS1eryu6nR6LiAggM2bN/Ppp59y4sQJKioquP322+nZsydeXnV68UpERETErdWpNA0ePJi0tDTuuece7rnnnvpek4iIiIjbqdPLQl5eXly+fLm+1yIiIiLitur0SlPv3r353e9+R58+fWjTps1Vb2aZkJBQL4sTERERcRd1Kk1ffvklXbp04cyZM05/aBeo1Z9SEREREWksTJemRx99lNdff52AgADefPNN4MrfgKvLnx8RERERaWxMX9OUnp5+1XVMv/rVr8jJyan3RYmIiIi4mxt6f4A6vMWTiIiISKOkN1USERERMUGlSURERMSEWv323F//+lf8/f0dt6uqqvjb3/5Gq1atnO43dOjQelmciIiIiLswXZrCw8NZuXKl01hQUBDr1q1zGrNYLCpNIiIi0uSYLk0ffPBBQ65DRERExK3pmiYRERERE1SaRERERExQaRIRERExQaVJRERExASVJhERERETVJpERERETFBpEhERETFBpUlERETEBJUmERERERNUmkRERERMUGkSERERMUGlSURERMQElSYRERERE1SaRERERExQaRIRERExQaVJRERExASVJhERERETXFqavvnmG8aNG0d0dDS9e/fmz3/+s2MuJyeHMWPGEBUVxcCBA/n444+dtj1w4ACDBw/GZrMxevRocnJynOZXr15Nr169iI6OZtq0aZSWljrmysrKmDZtGrGxsfTs2ZOVK1c2bFARERFp9FxWmqqqqhg/fjyBgYG88847zJo1i9dff50dO3ZgGAaTJk0iODiYLVu2MGTIEBISEsjNzQUgNzeXSZMmER8fz+bNm2nVqhVPPfUUhmEAsGfPHtLS0pg9ezZr1qzBbrezcOFCx2OnpqZy9OhR1qxZw8yZM0lLS2P37t0ueR5ERESkcfB21QMXFhbSqVMnUlJS8Pf357bbbuOee+4hPT2d4OBgcnJy2LBhA82bN6d9+/Z8+umnbNmyhcmTJ7Np0ybuuusuxo4dC8CCBQu49957+fzzz+nevTtr167l8ccfp0+fPgDMmjWLcePGMWXKFAzDYNOmTfzpT3+iS5cudOnShezsbNavX0///v1d9XSIiIiIm3PZK02hoaG8/PLL+Pv7YxgG6enp/P3vfycuLg673U7nzp1p3ry54/4xMTFkZGQAYLfbiY2NdcxZrVa6dOlCRkYGlZWVHDlyxGk+KiqKy5cvk5WVRVZWFhUVFURHRzvt2263U1VV1fDBRUREpFFy2StNP9S3b19yc3Pp06cPDz74IPPnzyc0NNTpPkFBQeTn5wNQUFBQ43xxcTFlZWVO897e3rRs2ZL8/Hy8vLwIDAzEx8fHMR8cHExZWRnnz5+nVatWptdtsdQl7c3fp/y0hnzOq/ftCZ9XT8nqKTnBc7IqZ9NTU9Ybze4WpWnJkiUUFhaSkpLCggULKC0tdSo1AD4+PpSXlwP85PylS5cct681bxjGNecAx/7NCgq6tVb3F/cTGNjipjyOJx0rnpLVU3KC52RVzqanvrO6RWmKjIwErvxW27PPPsuwYcOcftsNrhQaPz
8/AHx9fa8qOOXl5QQEBODr6+u4/eN5q9VKZWXlNecAx/7NOnv2At9fe15vvL29aNny5vwgFygq+o7KyoY7LWuxXPmibYhjxd14SlZPyQmek1U5m56aslaP15VLLwTPyMigX79+jrE77riDy5cvExISwokTJ666f/Upt7CwMAoLC6+a79SpEy1btsTX15fCwkLat28PQEVFBefPnyckJATDMCgqKqKiogJv7yvxCwoK8PPzIyAgoFYZDIN6P/Ca+oHsjm7Gc94Qx4q78pSsnpITPCercjY99Z3VZReC/+tf/yIhIYHTp087xo4ePUqrVq2IiYnhiy++cJxqA0hPT8dmswFgs9lIT093zJWWlnLs2DFsNhteXl5ERkY6zWdkZODt7U3Hjh3p1KkT3t7ejovKq/cdGRmJl5fe61NERESuzWUtITIyki5dujBt2jSOHz/Ovn37WLhwIf/xH/9BXFwcrVu3JikpiezsbFasWEFmZibDhw8HYNiwYRw6dIgVK1aQnZ1NUlISERERdO/eHYBHHnmEN954g71795KZmUlKSgojR47EarVitVoZOnQoKSkpZGZmsnfvXlauXMno0aNd9VSIiIhII+Cy03PNmjXjtddeY86cOfzmN7/BarXy2GOPMXr0aCwWC6+99hrJycnEx8fTrl07li5dSnh4OAARERG8+uqrzJ8/n6VLlxIdHc3SpUuxfH9Z/KBBgzh16hQzZsygvLycBx54gClTpjgeOykpiZSUFB5//HH8/f2ZPHkyDzzwgEueBxEREWkcLIbhKWc2619hYcNcCB4Y2IJBS/bzRW5x/e68gTxsa82SUd0a1Zq7hAewK7EXRUXfUVHRsBeCBwff2iDHirvxlKyekhM8J6tyNj01Za0erytdxCMiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIia4tDSdPn2axMRE4uLi6NWrFwsWLKCsrAyAnJwcxowZQ1RUFAMHDuTjjz922vbAgQMMHjwYm83G6NGjycnJcZpfvXo1vXr1Ijo6mmnTplFaWuqYKysrY9q0acTGxtKzZ09WrlzZ8GFFRESkUXNZaTIMg8TEREpLS1m/fj0vvfQSH374IS+//DKGYTBp0iSCg4PZsmULQ4YMISEhgdzcXAByc3OZNGkS8fHxbN68mVatWvHUU09hGAYAe/bsIS0tjdmzZ7NmzRrsdjsLFy50PHZqaipHjx5lzZo1zJw5k7S0NHbv3u2S50FEREQaB29XPfCJEyfIyMjgk08+ITg4GIDExEReeOEFfv3rX5OTk8OGDRto3rw57du359NPP2XLli1MnjyZTZs2cddddzF27FgAFixYwL333svnn39O9+7dWbt2LY8//jh9+vQBYNasWYwbN44pU6ZgGAabNm3iT3/6E126dKFLly5kZ2ezfv16+vfv76qnQ0RERNycy15pCgkJ4c9//rOjMFW7ePEidrudzp0707x5c8d4TEwMGRkZANjtdmJjYx1zVquVLl26kJGRQWVlJUeOHHGaj4qK4vLly2RlZZGVlUVFRQXR0dFO+7bb7VRVVTVQWhEREWnsXPZKU0BAAL169XLcrqqqYt26dfTo0YOCggJCQ0Od7h8UFER+fj7AT84XFxdTVlbmNO/t7U3Lli3Jz8/Hy8uLwMBAfHx8HPPBwcGUlZVx/vx5WrVqZTqDxVKryC7bp/y0hnzOq/ftCZ9XT8nqKTnBc7IqZ9NTU9Ybze6y0vRjCx
cu5NixY2zevJnVq1c7lRoAHx8fysvLASgtLa1x/tKlS47b15o3DOOac4Bj/2YFBd1aq/uL+wkMbHFTHseTjhVPyeopOcFzsipn01PfWd2iNC1cuJA1a9bw0ksv0aFDB3x9fTl//rzTfcrLy/Hz8wPA19f3qoJTXl5OQEAAvr6+jts/nrdarVRWVl5zDnDs36yzZy/w/bXn9cbb24uWLW/OD3KBoqLvqKxsuNOyFsuVL9qGOFbcjadk9ZSc4DlZlbPpqSlr9Xhdubw0zZkzh7/85S8sXLiQBx98EICwsDCOHz/udL/CwkLHKbewsDAKCwuvmu/UqRMtW7bE19eXwsJC2rdvD0BFRQXnz58nJCQEwzAoKiqioqICb+8r8QsKCvDz8yMgIKBWazcM6v3Aa+oHsju6Gc95Qxwr7spTsnpKTvCcrMrZ9NR3Vpe+T1NaWhobNmzgxRdfZNCgQY5xm83GF1984TjVBpCeno7NZnPMp6enO+ZKS0s5duwYNpsNLy8vIiMjneYzMjLw9vamY8eOdOrUCW9vb8dF5dX7joyMxMtL7/UpIiIi1+aylvDVV1/x2muv8eSTTxITE0NBQYHjIy4ujtatW5OUlER2djYrVqwgMzOT4cOHAzBs2DAOHTrEihUryM7OJikpiYiICLp37w7AI488whtvvMHevXvJzMwkJSWFkSNHYrVasVqtDB06lJSUFDIzM9m7dy8rV65k9OjRrnoqREREpBFw2em5999/n8rKSl5//XVef/11p7kvv/yS1157jeTkZOLj42nXrh1Lly4lPDwcgIiICF599VXmz5/P0qVLiY6OZunSpVi+vyx+0KBBnDp1ihkzZlBeXs4DDzzAlClTHPtPSkoiJSWFxx9/HH9/fyZPnswDDzxw88KLiIhIo2MxDE85s1n/Cgsb5kLwwMAWDFqyny9yi+t35w3kYVtrlozq1qjW3CU8gF2JvSgq+o6Kioa9EDw4+NYGOVbcjadk9ZSc4DlZlbPpqSlr9Xhd6SIeERERERNUmkRERERMUGkSERERMUGlSURERMQElSYRERERE1SaRERERExQaRIRERExQaVJRERExASVJhERERETVJpERERETFBpEhERETFBpUlERETEBJUmERERERNUmkRERERMUGkSERERMUGlSURERMQElSYRERERE1SaRERERExQaRIRERExQaVJRERExASVJhERERETVJpERERETFBpEhERETFBpUlERETEBJUmERERERNUmkRERERMUGkSERERMUGlSURERMQElSYRERERE1SaRERERExwi9JUXl7O4MGDOXjwoGMsJyeHMWPGEBUVxcCBA/n444+dtjlw4ACDBw/GZrMxevRocnJynOZXr15Nr169iI6OZtq0aZSWljrmysrKmDZtGrGxsfTs2ZOVK1c2bEARERFp9FxemsrKyvj9739Pdna2Y8wwDCZNmkRwcDBbtmxhyJAhJCQkkJubC0Bubi6TJk0iPj6ezZs306pVK5566ikMwwBgz549pKWlMXv2bNasWYPdbmfhwoWO/aempnL06FHWrFnDzJkzSUtLY/fu3Tc3uIiIiDQqLi1Nx48fZ+TIkZw8edJp/LPPPiMnJ4fZs2fTvn17JkyYQFRUFFu2bAFg06ZN3HXXXYwdO5Zf/vKXLFiwgFOnTvH5558DsHbtWh5//HH69OlD165dmTVrFlu2bKG0tJSSkhI2bdpEcnIyXbp04f777+eJJ55g/fr1Nz2/iIiINB4uLU2ff/453bt3Z+PGjU7jdrudzp0707x5c8dYTEwMGRkZjvnY2FjHnNVqpUuXLmRkZFBZWcmRI0ec5qOiorh8+TJZWVlkZWVRUVFBdHS0077tdjtVVVUNlFREREQaO29XPvgjjzxyzfGCggJCQ0OdxoKCgsjPz7/ufHFxMWVlZU7z3t7etGzZkvz8fLy8vAgMDMTHx8cxHxwcTFlZGefPn6dVq1am12+xmL6rS/cpP60hn/
PqfXvC59VTsnpKTvCcrMrZ9NSU9Uazu7Q01aS0tNSp1AD4+PhQXl5+3flLly45bl9r3jCMa84Bjv2bFRR0a63uL+4nMLDFTXkcTzpWPCWrp+QEz8mqnE1PfWd1y9Lk6+vL+fPnncbKy8vx8/NzzP+44JSXlxMQEICvr6/j9o/nrVYrlZWV15wDHPs36+zZC3x/7Xm98fb2omXLm/ODXKCo6DsqKxvutKzFcuWLtiGOFXfjKVk9JSd4TlblbHpqylo9XlduWZrCwsI4fvy401hhYaHjlFtYWBiFhYVXzXfq1ImWLVvi6+tLYWEh7du3B6CiooLz588TEhKCYRgUFRVRUVGBt/eV+AUFBfj5+REQEFCrdRoG9X7gNfUD2R3djOe8IY4Vd+UpWT0lJ3hOVuVseuo7q8vfcuBabDYbX3zxheNUG0B6ejo2m80xn56e7pgrLS3l2LFj2Gw2vLy8iIyMdJrPyMjA29ubjh070qlTJ7y9vR0XlVfvOzIyEi8vt3w6RERExA24ZUuIi4ujdevWJCUlkZ2dzYoVK8jMzGT48OEADBs2jEOHDrFixQqys7NJSkoiIiKC7t27A1cuMH/jjTfYu3cvmZmZpKSkMHLkSKxWK1arlaFDh5KSkkJmZiZ79+5l5cqVjB492pWRRURExM255em5Zs2a8dprr5GcnEx8fDzt2rVj6dKlhIeHAxAREcGrr77K/PnzWbp0KdHR0SxduhTL95fFDxo0iFOnTjFjxgzKy8t54IEHmDJlimP/SUlJpKSk8Pjjj+Pv78/kyZN54IEHXJJVREREGge3KU1ffvml0+127dqxbt26Gu9/3333cd9999U4P378eMaPH3/NOavVygsvvMALL7xQt8WKiIiIx3HL03MiIiIi7kalSURERMQElSYRERERE1SaRERERExQaRIRERExwW1+e07EFZo1uzn/31Bfj1NVZVBV5SFv5Ssi4mZUmsQjhfj7UlllEBBgvSmPV19/GLiisopvz5eoOImIuIBKk3ikAKs3zbwsPL3hMMfPXHT1cky5I9SfV34bjZeXRaVJRMQFVJrEox0/c5EvcotdvQwREWkEdCG4iIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiYoNIkIiIiYoJKk4iIiIgJKk0iIiIiJqg0iYiIiJig0iQiIiJigkqTiIiIiAkqTSIiIiImqDSJiIiImKDSJCIiImKCSpOIiIiICSpNIiIiIiaoNImIiIiY4O3qBYhI7TRr5t7/r/Pj9VVVGVRVGS5ajYhI/VFpEmkkQvx9qawyCAiwunopPykwsIXT7YrKKr49X6LiJCKNnseWprKyMmbNmsV7772Hn58fY8eOZezYsa5elkiNAqzeNPOy8PSGwxw/c9HVyzHljlB/XvltNF5eFpUmEWn0PLY0paamcvToUdasWUNubi7PPfcc4eHh9O/f39VLE/lJx89c5IvcYlcvQ0TE43hkaSopKWHTpk386U9/okuXLnTp0oXs7GzWr1+v0iQiIiLX5JGlKSsri4qKCqKjox1jMTExLFu2jKqqKry83PtCW5HGxt0vXv8xXbwuItfikaWpoKCAwMBAfHx8HGPBwcGUlZVx/vx5WrVqZWo/Xl5g1PP3VYvlyn+7hAdg9WlWvztvIO1D/AGtuaE1xjXH/FvLRnHx+o9VVFbx3cVL1yxO1V+j3t5e9f71fyMM43/XVl8aOmtDrLkuapPTXdZcG9Vrdtdj91oMw8C4gUVWZ/3xz+kb/dxZjBtZVSO1bds2XnnlFT788EPHWE5ODv369WPfvn38/Oc/d+HqRERExB01rtfM64mvry/l5eVOY9W3/fz8XLEkERERcXMeWZrCwsIoKiqioqLCMVZQUICfnx8BAQEuXJmIiIi4K48sTZ06dcLb25uMjAzHWHp6OpGRkboIXERERK7JIxuC1Wpl6NChpKSkkJmZyd69e1m5ciWjR4929d
JERETETXnkheAApaWlpKSk8N577+Hv78+4ceMYM2aMq5clIiIibspjS5OIiIhIbXjk6TkRERGR2lJpEhERETFBpUlERETEBJUmN1FWVsa0adOIjY2lZ8+erFy50tVLqjenT58mMTGRuLg4evXqxYIFCygrKwOuvBP7mDFjiIqKYuDAgXz88ccuXm39GD9+PM8//7zj9rFjxxgxYgQ2m41hw4Zx9OhRF67uxpWXlzNr1izuvvtufvWrX/Hiiy86/uRBU8qal5fHhAkT6NatG3379mX16tWOuaaSs7y8nMGDB3Pw4EHH2PW+Lg8cOMDgwYOx2WyMHj2anJycm73sWrtWzoyMDH77298SHR3Ngw8+yKZNm5y2aSo5q124cIFevXqxdetWp/GdO3fSr18/bDYbkyZN4ty5czdruTfkWllzc3N58sknsdls3H///bz77rtO29xoVpUmN5GamsrRo0dZs2YNM2fOJC0tjd27d7t6WTfMMAwSExMpLS1l/fr1vPTSS3z44Ye8/PLLGIbBpEmTCA4OZsuWLQwZMoSEhARyc3NdvewbsmvXLvbt2+e4XVJSwvjx44mNjWXr1q1ER0czYcIESkpKXLjKGzN37lwOHDjAG2+8weLFi3n77bfZuHFjk8v6zDPP0Lx5c7Zu3cq0adN4+eWX+dvf/tZkcpaVlfH73/+e7Oxsx9j1vi5zc3OZNGkS8fHxbN68mVatWvHUU0/d0N8Ja2jXyllQUMCTTz5JXFwc77zzDomJicyZM4ePPvoIaDo5f2jhwoWcOXPGaSwzM5Pk5GQSEhLYuHEjxcXFJCUl3Yzl3pBrZa2oqGDChAl4e3vzzjvvMG7cOKZOncr//M//APWU1RCX++6774zIyEjjs88+c4wtXbrU+Pd//3cXrqp+HD9+3OjQoYNRUFDgGNuxY4fRs2dP48CBA0ZUVJTx3XffOeYef/xxY8mSJa5Yar0oKioyfv3rXxvDhg0znnvuOcMwDGPTpk1G3759jaqqKsMwDKOqqsq4//77jS1btrhyqXVWVFRkdO7c2Th48KBjbPny5cbzzz/fpLKeP3/e6NChg/Hll186xhISEoxZs2Y1iZzZ2dnGww8/bDz00ENGhw4dHN9/rvd1+fLLLzt9byopKTGio6Odvn+5k5pyvvXWW0b//v2d7vvHP/7R+P3vf28YRtPJWe3vf/+7cf/99xv33nuv03E6ZcoUx/cqwzCM3Nxc48477zROnjx509ZeWzVl3bt3rxETE2NcuHDBcd+JEycaGzZsMAyjfrLqlSY3kJWVRUVFBdHR0Y6xmJgY7HY7VVVVLlzZjQsJCeHPf/4zwcHBTuMXL17EbrfTuXNnmjdv7hiPiYlxeqf2xuaFF15gyJAh3HHHHY4xu91OTEwMlu//vLbFYqFbt26NNmd6ejr+/v7ExcU5xsaPH8+CBQuaVFY/Pz+sVitbt27l8uXLnDhxgkOHDtGpU6cmkfPzzz+ne/fubNy40Wn8el+Xdrud2NhYx5zVaqVLly5um72mnNWXCvzYxYsXgaaTE66cxvrjH//IjBkz8PHxcZr7cc7WrVsTHh6O3W5v8DXXVU1ZP//8c+655x78/f0dY6+99hq/+c1vgPrJ6n2Da5d6UFBQQGBgoNPBHBwcTFlZGefPn6dVq1YuXN2NCQgIoFevXo7bVVVVrFu3jh49elBQUEBoaKjT/YOCgsjPz7/Zy6wXn376Kf/4xz/YsWMHKSkpjvGCggKnEgVXctb0Erq7y8nJoU2bNmzbto1ly5Zx+fJl4uPjmThxYpPK6uvry4wZM5gzZw5r166lsrKS+Ph4RowYwfvvv9/ocz7yyCPXHL/e12Vj+7qtKWdERAQRERGO22fPnmXXrl1MnjwZaDo5AZYtW0bnzp3p2bPnVXNnzpxpVDmh5qzV35sWLVrE9u3bCQwMJDExkX79+gH1k1WlyQ2UlpZe1f6rb5eXl7tiSQ1m4cKFHDt2jM2bN7N69epr5m6Mmc
vKypg5cyYzZszAz8/Paa6mz29jzAlXrtH65ptv2LBhAwsWLKCgoIAZM2ZgtVqbXNavvvqKPn368Lvf/Y7s7GzmzJnDPffc0+Ry/tD1sjXF7JcuXWLy5MkEBwc7XpVoKjmPHz/Ohg0b+M///M9rzl+6dKlJ5IQr35veeecdBg4cyLJlyzh48CCJiYls3LiRyMjIesmq0uQGfH19r/qkVd/+8Q/gxmzhwoWsWbOGl156iQ4dOuDr68v58+ed7lNeXt4oM6elpXHXXXc5vapWrabPb2PMCeDt7c3FixdZvHgxbdq0Aa5cNPuXv/yFdu3aNZmsn376KZs3b2bfvn34+fkRGRnJ6dOnef3112nbtm2Tyflj1/u6rOl4DggIuFlLrFffffcdTz31FP/85z956623sFqtQNPIaRgG06dPJzEx8apLJKrVlLP6eWhMmjVrRsuWLUlJScHLy4suXbrwj3/8g7fffpvIyMh6yaprmtxAWFgYRUVFVFRUOMYKCgrw8/NrVF+gP2XOnDmsWrWKhQsX8uCDDwJXchcWFjrdr7Cw8KqXTxuDXbt2sXfvXqKjo4mOjmbHjh3s2LGD6OjoJpUTrlyn5uvr6yhMALfffjt5eXlNKuvRo0dp166dUxHq3Lkzubm5TSrnj10vW03zISEhN22N9eXixYuMGzeO7Oxs1qxZw2233eaYawo5c3NzOXz4MC+88ILje1Nubi4zZ87kiSeeAJpGzmqhoaHcdttteHn9b7Wp/t4E9ZNVpckNdOrUCW9vb6cLDNPT04mMjHT65DdWaWlpbNiwgRdffJFBgwY5xm02G1988QWXLl1yjKWnp2Oz2VyxzBvy5ptvsmPHDrZt28a2bdvo27cvffv2Zdu2bdhsNg4fPuz4VWXDMDh06FCjzAlXPm9lZWV8/fXXjrETJ07Qpk2bJpU1NDSUb775xun/TE+cOEFERESTyvlj1/u6tNlspKenO+ZKS0s5duxYo8teVVVFQkIC//rXv3jzzTf55S9/6TTfFHKGhYXx3nvvOb4vbdu2jdDQUBITE5k3bx5wdc68vDzy8vIaVc5qNpuN7OxsKisrHWNfffWV43/w6iNr4/+J3ARYrVaGDh1KSkoKmZmZ7N27l5UrVzJ69GhXL+2GffXVV7z22ms8+eSTxMTEUFBQ4PiIi4ujdevWJCUlkZ2dzYoVK8jMzGT48OGuXnattWnThnbt2jk+WrRoQYsWLWjXrh39+/enuLiYefPmcfz4cebNm0dpaSkDBgxw9bLr5Be/+AW9e/cmKSmJrKws9u/fz4oVKxg1alSTytq3b19uueUWpk+fztdff80HH3zAsmXLeOyxx5pUzh+73tflsGHDOHToECtWrCA7O5ukpCQiIiLo3r27i1deO5s3b+bgwYPMnTuXgIAAx/el6lOTTSGnt7e30/eldu3a4e3tTVBQEGFhYQCMGjWK7du3s2nTJrKyspg6dSq9e/embdu2Ll597Q0ePJiqqipmzZrFN998w/r169m/fz8jR44E6inrjb5fgtSPkpISY+rUqUZUVJTRs2dPY9WqVa5eUr1Yvny50aFDh2t+GIZh/POf/zQeffRR46677jIGDRpkfPLJJy5ecf147rnnnN4PxG63G0OHDjUiIyON4cOHG1988YULV3fjiouLjSlTphhRUVHGPffcY7z66quO9yxqSlmzs7ONMWPGGN26dTP69etnrFq1qknm/PH7+lzv6/Kjjz4yHnjgAaNr167G448/7tbv6fNDP8w5duzYa35f+uF7MzWFnD/Wp0+fq95PbMuWLcZ9991nREVFGZMmTTLOnTt3M5ZZL36cNTs723HsPvDAA8aePXuc7n+jWS2G4cZvbyoiIiLiJnR6TkRERMQElSYRERERE1SaRERERExQaRIRERExQaVJRERExASVJhERERETVJpERERETFBpEhERETFBpUlERETEBJUmERERERNUmkRERERMUGkSERERMeH/AxydM34G67
DhAAAAAElFTkSuQmCC\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"vectors = []\n",
"vector_norms = []\n",
"\n",
"# nlp.pipe expects text, so coerce every document to str first.\n",
"documents = tnse_nlp['Document'].astype(str).values\n",
"\n",
"for doc in nlp.pipe(documents, batch_size=300, n_process=4):\n",
"    if nlp_version == 'en_core_web_trf':\n",
"        # Average the last transformer tensor once and reuse it\n",
"        # (presumably yields one vector per document - TODO confirm tensor layout).\n",
"        trf_vector = doc._.trf_data.tensors[-1].mean(axis=0)\n",
"        trf_norm = np.linalg.norm(trf_vector)\n",
"        # Scale to unit length; an all-zero vector is kept as is to avoid dividing by zero.\n",
"        vectors.append(trf_vector / trf_norm if trf_norm > 0 else trf_vector)\n",
"        # Store the norm measured before scaling; after scaling it would always be 1.\n",
"        vector_norms.append(trf_norm)\n",
"    else:\n",
"        vectors.append(doc.vector)\n",
"        vector_norms.append(doc.vector_norm)\n",
"\n",
"\n",
"tnse_nlp['vector'] = vectors\n",
"tnse_nlp['vector_norm'] = vector_norms\n",
"tnse_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"KeyboardInterrupt\n",
"\n"
]
}
],
"source": [
"# Numeric matrix with one embedding per row, in the same row order as tnse_nlp.\n",
"# Built straight from the vectors so the array is float-typed instead of object-typed\n",
"# (mixing the string ID column into `.values` forces dtype=object).\n",
"record_vectors = np.array(tnse_nlp[\"vector\"].to_list(), dtype=np.float64)\n",
"labels = tnse_nlp[record_col].to_numpy()\n",
"\n",
"# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter`; use whichever name this version accepts.\n",
"iter_kwarg = \"max_iter\" if \"max_iter\" in inspect.signature(TSNE.__init__).parameters else \"n_iter\"\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', random_state=42, metric='cosine',\n",
"                  **{iter_kwarg: 5000})\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"\n",
"# Re-attach the record keys to the 2-D coordinates.\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Attach the 2-D coordinates to the documents via the record key.\n",
"wos_plot = tnse_nlp.merge(tnse_data, on=record_col)\n",
"\n",
"# `data` is passed by keyword: seaborn releases before 0.12 do not accept it positionally.\n",
"g = sns.scatterplot(data=wos_plot, x=\"TNSE-X\", y=\"TNSE-Y\",\n",
"                    hue='Type', s=1)\n",
"g.set_title(\"t-SNE projection of keyword and document embeddings\")\n",
"# Legend goes outside the axes; the trailing ';' hides the Legend repr in the cell output.\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}