You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_extract/wos_search_kw_analysis.ipynb

161 lines
6.7 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import os\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [],
"source": [
"agg_df = pd.DataFrame()\n",
"\n",
"workdir_path = 'wos_downloads/aggregated'\n",
"for root, dirs, files in os.walk(workdir_path):\n",
" for filename in files:\n",
" if 'analyze_' in filename:\n",
" path=os.path.join(root, filename)\n",
" with open(os.path.join(root, 'query.txt'),'r') as f:\n",
" query = f.readline()\n",
" chunk = pd.read_csv(path, sep='\\t')[[\"Publication Years\",\"Record Count\"]]\n",
" chunk[\"name\"] = filename.replace(\".txt\",\"\")\n",
" chunk[\"query\"] = query\n",
" agg_df = pd.concat([chunk,agg_df],ignore_index=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 74,
"outputs": [],
"source": [
"agg_df[\"region\"] = agg_df[\"query\"].apply(lambda x: \"EU+China\" if \"CU\" in x else \"Global\")\n",
"agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0])\n",
"agg_df[\"kw_token\"] = agg_df[\"kw_token\"].apply(lambda x: \"OR COMPOSITE\" if \" OR \" in x else x)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 83,
"outputs": [],
"source": [
"agg_df = agg_df[~agg_df[\"Record Count\"].isna()]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 62,
"outputs": [
{
"data": {
"text/plain": " query Record Count\n0 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 972.0\n1 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 451.0\n2 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 12.0\n3 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 5.0\n4 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 2631.0\n.. ... ...\n275 TS=(\"ubiquitous computing\") AND PY=(2011-2022) 3655.0\n276 TS=(\"unstructured data*\") AND PY=(2011-2022) 3386.0\n277 TS=(\"unsupervised deep learning\") AND PY=(2011... 728.0\n278 TS=(\"word embedding*\") AND PY=(2011-2022) 7068.0\n279 TS=(\"word vector*\") AND PY=(2011-2022) 1747.0\n\n[280 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>query</th>\n <th>Record Count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>972.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>451.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>12.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>5.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>2631.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>275</th>\n <td>TS=(\"ubiquitous computing\") AND PY=(2011-2022)</td>\n <td>3655.0</td>\n </tr>\n <tr>\n <th>276</th>\n <td>TS=(\"unstructured data*\") AND PY=(2011-2022)</td>\n <td>3386.0</td>\n </tr>\n <tr>\n <th>277</th>\n <td>TS=(\"unsupervised deep learning\") AND PY=(2011...</td>\n <td>728.0</td>\n </tr>\n <tr>\n <th>278</th>\n <td>TS=(\"word embedding*\") AND PY=(2011-2022)</td>\n <td>7068.0</td>\n </tr>\n <tr>\n <th>279</th>\n <td>TS=(\"word vector*\") AND PY=(2011-2022)</td>\n <td>1747.0</td>\n </tr>\n </tbody>\n</table>\n<p>280 rows × 2 columns</p>\n</div>"
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agg_df.groupby(\"query\",as_index=False)[\"Record Count\"].sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 63,
"outputs": [],
"source": [
"# agg_df = agg_df[agg_df[\"Publication Years\"].str.startswith(\"20\", na=False)].copy()\n",
"# agg_df[\"Publication Years\"] = agg_df[\"Publication Years\"].astype(int)\n",
"# agg_df[((agg_df[\"Publication Years\"]>2010) & (agg_df[\"Publication Years\"]<2023))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 84,
"outputs": [
{
"data": {
"text/plain": "Publication Years\n2022 314\n2019 305\n2021 305\n2020 302\n2018 296\n2017 287\n2016 281\n2015 271\n2014 258\n2013 251\n2012 233\n2011 224\n2023 52\n2017 4\n2014 4\n2019 4\n2021 4\n2018 4\n2020 4\n2022 4\n2016 3\n2015 3\n2013 3\n2012 3\n2011 3\n2023 2\nName: count, dtype: int64"
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agg_df[\"Publication Years\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 85,
"outputs": [],
"source": [
"agg_df.to_excel(r'C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_processed_data\\query_yearly_agg.xlsx', index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}