You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_extract/wos_search_kw_analysis.ipynb

158 lines
8.6 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import os\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 60,
"outputs": [],
"source": [
"agg_df = pd.DataFrame()\n",
"\n",
"workdir_path = 'wos_downloads/aggregated'\n",
"for root, dirs, files in os.walk(workdir_path):\n",
" for filename in files:\n",
" if 'analyze_' in filename:\n",
" path=os.path.join(root, filename)\n",
" with open(os.path.join(root, 'query.txt'),'r') as f:\n",
" query = f.readline()\n",
" chunk = pd.read_csv(path, sep='\\t')[[\"Publication Years\",\"Record Count\"]]\n",
" chunk[\"name\"] = filename.replace(\".txt\",\"\")\n",
" chunk[\"query\"] = query\n",
" agg_df = pd.concat([chunk,agg_df],ignore_index=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 61,
"outputs": [],
"source": [
"agg_df[\"region\"] = agg_df[\"query\"].apply(lambda x: \"EU+China\" if \"CU\" in x else \"Global\")\n",
"agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 61,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 62,
"outputs": [
{
"data": {
"text/plain": " query Record Count\n0 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 972.0\n1 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 451.0\n2 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 12.0\n3 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 5.0\n4 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 2631.0\n.. ... ...\n275 TS=(\"ubiquitous computing\") AND PY=(2011-2022) 3655.0\n276 TS=(\"unstructured data*\") AND PY=(2011-2022) 3386.0\n277 TS=(\"unsupervised deep learning\") AND PY=(2011... 728.0\n278 TS=(\"word embedding*\") AND PY=(2011-2022) 7068.0\n279 TS=(\"word vector*\") AND PY=(2011-2022) 1747.0\n\n[280 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>query</th>\n <th>Record Count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>972.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>451.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>12.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>5.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>2631.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>275</th>\n <td>TS=(\"ubiquitous computing\") AND PY=(2011-2022)</td>\n <td>3655.0</td>\n </tr>\n <tr>\n <th>276</th>\n <td>TS=(\"unstructured data*\") AND PY=(2011-2022)</td>\n <td>3386.0</td>\n </tr>\n <tr>\n <th>277</th>\n <td>TS=(\"unsupervised deep learning\") AND PY=(2011...</td>\n <td>728.0</td>\n </tr>\n <tr>\n <th>278</th>\n <td>TS=(\"word embedding*\") AND PY=(2011-2022)</td>\n <td>7068.0</td>\n </tr>\n <tr>\n <th>279</th>\n <td>TS=(\"word vector*\") AND PY=(2011-2022)</td>\n <td>1747.0</td>\n </tr>\n </tbody>\n</table>\n<p>280 rows × 2 columns</p>\n</div>"
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agg_df.groupby(\"query\",as_index=False)[\"Record Count\"].sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 63,
"outputs": [],
"source": [
"# agg_df = agg_df[agg_df[\"Publication Years\"].str.startswith(\"20\", na=False)].copy()\n",
"# agg_df[\"Publication Years\"] = agg_df[\"Publication Years\"].astype(int)\n",
"# agg_df[((agg_df[\"Publication Years\"]>2010) & (agg_df[\"Publication Years\"]<2023))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [
{
"data": {
"text/plain": "Publication Years\n2022 268\n2021 260\n2019 258\n2020 258\n2018 250\n2017 243\n2016 237\n2015 227\n2014 215\n2013 208\n2012 193\n2011 184\n2023 44\n2014 4\n2019 4\n2017 4\n2018 4\n2020 4\n2022 4\n2021 4\n2016 3\n2015 3\n2013 3\n2012 3\n2011 3\n2023 2\nShowing 25 out of 29 entries 1\nShowing 25 out of 205 entries 1\n8 record(s) (0.025%) do not contain data in the field being analyzed 1\nShowing 25 out of 85 entries 1\nShowing 25 out of 189 entries 1\n1 record(s) (0.011%) do not contain data in the field being analyzed 1\nName: count, dtype: int64"
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agg_df[\"Publication Years\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 65,
"outputs": [],
"source": [
"agg_df.to_excel(r'C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_processed_data\\query_yearly_agg.xlsx', index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}