You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_nlp_demo.ipynb

202 lines
29 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re\n",
"import spacy\n",
"\n",
"\n",
"# spaCy pipeline to load; 'en_core_web_trf' is the alternative handled by the embedding cell.\n",
"nlp_version = 'en_core_web_lg'  # OR: 'en_core_web_trf'\n",
"nlp = spacy.load(name=nlp_version)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"# Folder the Excel inputs are read from.\n",
"outdir = \"wos_processed_data\"\n",
"# Column holding the record identifier; used as the grouping and join key.\n",
"record_col = \"UT (Unique WOS ID)\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"# Read the keyword table and the processed records from the output folder.\n",
"kw_df = pd.read_excel(f\"{outdir}/wos_keywords.xlsx\")\n",
"wos = pd.read_excel(f\"{outdir}/wos_processed.xlsx\")\n",
"\n",
"# Keep only the rows that actually carry a keyword.\n",
"kw_df = kw_df.loc[kw_df[\"keyword_all\"].notna()].copy()\n",
"\n",
"# One row per record, with all of its keywords joined by '; '.\n",
"wos_kwd_concat = (\n",
"    kw_df.groupby(record_col)[\"keyword_all\"]\n",
"    .agg(\"; \".join)\n",
"    .reset_index()\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# Keyword pseudo-documents: one row per distinct keyword, id = 'kw_' + original row index.\n",
"kwd_nlp = kw_df[[\"keyword_all\"]].drop_duplicates().rename(columns={\"keyword_all\": \"Document\"})\n",
"kwd_nlp[\"Type\"] = \"kw\"\n",
"kwd_nlp[record_col] = \"kw_\" + kwd_nlp.index.astype(str)\n",
"\n",
"# merge() defaults to an inner join, so only records present in wos_kwd_concat are kept.\n",
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
"# Document text = title - abstract - keywords, upper-cased; missing abstracts become ''.\n",
"wos_nlp[\"Document\"] = (\n",
"    wos_nlp[\"Article Title\"]\n",
"    .str.cat(wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=\" - \")\n",
"    .str.upper()\n",
")\n",
"# Keep the de-duplicated frame: the previous call discarded its result, so repeated\n",
"# (record id, document) pairs survived and would multiply rows in later merges on record_col.\n",
"wos_nlp = wos_nlp.drop_duplicates(subset=[record_col, \"Document\"])\n",
"wos_nlp[\"Type\"] = \"doc\"\n",
"\n",
"# Stack keyword rows and record rows; keyword rows have no keyword_all value (NaN).\n",
"tnse_nlp = pd.concat([kwd_nlp, wos_nlp], ignore_index=True)\n",
"tnse_nlp = tnse_nlp[[record_col, \"Type\", \"Document\", \"keyword_all\"]]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "<Axes: ylabel='Frequency'>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Embed every document with the loaded spaCy pipeline; keep each vector and its norm.\n",
"use_trf = nlp_version == 'en_core_web_trf'  # loop-invariant, so decided once\n",
"vectors = []\n",
"vector_norms = []\n",
"\n",
"docs = nlp.pipe(tnse_nlp['Document'].astype('unicode').values, batch_size=300, n_process=4)\n",
"for doc in docs:\n",
"    if use_trf:\n",
"        # Average the last trf_data tensor over axis 0 (computed once, then reused)\n",
"        # and scale it to unit length.\n",
"        trf_vector = doc._.trf_data.tensors[-1].mean(axis=0)\n",
"        norm_vector = trf_vector / np.linalg.norm(trf_vector)\n",
"        vectors.append(norm_vector)\n",
"        # Norm of the already-normalised vector, as before.\n",
"        vector_norms.append(np.linalg.norm(norm_vector))\n",
"    else:\n",
"        vectors.append(doc.vector)\n",
"        vector_norms.append(doc.vector_norm)\n",
"\n",
"tnse_nlp['vector'] = vectors\n",
"tnse_nlp['vector_norm'] = vector_norms\n",
"# Histogram of the vector norms.\n",
"tnse_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"KeyboardInterrupt\n",
"\n"
]
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Build the inputs as a plain label array and a 2-D numeric matrix, rather than slicing\n",
"# them out of a mixed id/float frame (which yields an object-dtype array).\n",
"labels = tnse_nlp[record_col].to_numpy()\n",
"record_vectors = np.vstack(tnse_nlp[\"vector\"].to_list())\n",
"\n",
"# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and deprecated the old name;\n",
"# pass whichever keyword the installed version accepts.\n",
"iter_kwarg = \"max_iter\" if \"max_iter\" in TSNE().get_params() else \"n_iter\"\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', random_state=42,\n",
"                  metric='cosine', **{iter_kwarg: 5000})\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"\n",
"# 2-D coordinates keyed by record id, ready to merge back onto tnse_nlp.\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Attach the 2-D t-SNE coordinates to the document table via the record id.\n",
"wos_plot = tnse_nlp.merge(tnse_data, on=record_col)\n",
"\n",
"# Pass the frame as `data=`: a positional first argument is only read as `data`\n",
"# from seaborn 0.12 on, while the keyword form works on older releases too.\n",
"g = sns.scatterplot(data=wos_plot, x=\"TNSE-X\", y=\"TNSE-Y\",\n",
"                    hue='Type', s=1)\n",
"# Place the legend outside the axes, anchored just right of the top-right corner.\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}