You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
202 lines
29 KiB
Plaintext
202 lines
29 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import os\n",
|
|
"import shutil\n",
|
|
"from flashgeotext.geotext import GeoText\n",
|
|
"import re\n",
|
|
"import spacy\n",
|
|
"\n",
|
|
"\n",
|
|
"nlp_version = 'en_core_web_lg' # OR: 'en_core_web_trf'\n",
|
|
"nlp = spacy.load(nlp_version)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"outputs": [],
|
|
"source": [
|
|
"outdir=\"wos_processed_data\"\n",
|
|
"record_col=\"UT (Unique WOS ID)\""
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"outputs": [],
|
|
"source": [
|
|
"kw_df = pd.read_excel(f\"{outdir}/wos_keywords.xlsx\")\n",
|
|
"wos = pd.read_excel(f\"{outdir}/wos_processed.xlsx\")\n",
|
|
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy()\n",
|
|
"wos_kwd_concat = kw_df.groupby(record_col,as_index=False).agg({'keyword_all': '; '.join})"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"outputs": [],
|
|
"source": [
|
|
"kwd_nlp = pd.DataFrame(kw_df[\"keyword_all\"].drop_duplicates())\n",
|
|
"kwd_nlp = kwd_nlp.rename(columns={\"keyword_all\":\"Document\"})\n",
|
|
"kwd_nlp[\"Type\"] = \"kw\"\n",
|
|
"kwd_nlp[record_col] = \"kw_\"+(kwd_nlp.index).astype(str)\n",
|
|
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
|
|
"# wos_nlp[\"Document\"] = wos_nlp[\"keyword_all\"].fillna(\"\").str.upper()\n",
|
|
"# wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(wos_nlp[[\"Abstract\"]].fillna(\"\"), sep=' - ').str.upper()\n",
|
|
"wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ').str.upper()\n",
|
|
"wos_nlp[[record_col, \"Document\"]].drop_duplicates()\n",
|
|
"wos_nlp[\"Type\"] = \"doc\"\n",
|
|
"\n",
|
|
"tnse_nlp = pd.concat([kwd_nlp,wos_nlp], ignore_index=True)\n",
|
|
"tnse_nlp = tnse_nlp[[record_col,\"Type\",\"Document\",\"keyword_all\"]]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "<Axes: ylabel='Frequency'>"
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
|
"image/png": "\n"
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"vectors = list()\n",
|
|
"vector_norms = list()\n",
|
|
"\n",
|
|
"for doc in nlp.pipe(tnse_nlp['Document'].astype('unicode').values, batch_size=300,\n",
|
|
" n_process=4):\n",
|
|
" if nlp_version == 'en_core_web_trf':\n",
|
|
" trf_vector = doc._.trf_data.tensors[-1].mean(axis=0)\n",
|
|
" trf_norm = np.linalg.norm(doc._.trf_data.tensors[-1].mean(axis=0))\n",
|
|
" norm_vector = trf_vector/trf_norm\n",
|
|
" vectors.append(norm_vector)\n",
|
|
" vector_norms.append(np.linalg.norm(norm_vector))\n",
|
|
" else:\n",
|
|
" vectors.append(doc.vector)\n",
|
|
" vector_norms.append(doc.vector_norm)\n",
|
|
"\n",
|
|
"\n",
|
|
"tnse_nlp['vector'] = vectors\n",
|
|
"tnse_nlp['vector_norm'] = vector_norms\n",
|
|
"tnse_nlp['vector_norm'].plot(kind=\"hist\")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"KeyboardInterrupt\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.manifold import TSNE\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"# % matplotlib inline\n",
|
|
"\n",
|
|
"vector_data = pd.DataFrame(tnse_nlp[\"vector\"].to_list(), index=tnse_nlp[record_col]).reset_index()\n",
|
|
"vector_data.head()\n",
|
|
"\n",
|
|
"labels = vector_data.values[:, 0]\n",
|
|
"record_vectors = vector_data.values[:, 1:]\n",
|
|
"\n",
|
|
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
|
|
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
|
|
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
|
|
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
|
|
"tnse_data.head()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_plot = tnse_nlp.merge(tnse_data, on=record_col)\n",
|
|
"\n",
|
|
"g = sns.scatterplot(wos_plot, x=\"TNSE-X\", y=\"TNSE-Y\",\n",
|
|
" hue='Type', s=1)\n",
|
|
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "2.7.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
}
|