You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_extract/wos_search_kw_analysis.ipynb

152 lines
6.2 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import os\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"agg_df = pd.DataFrame()\n",
"\n",
"workdir_path = 'wos_downloads/aggregated'\n",
"for root, dirs, files in os.walk(workdir_path):\n",
" for filename in files:\n",
" if 'analyze_' in filename:\n",
" path=os.path.join(root, filename)\n",
" with open(os.path.join(root, 'query.txt'),'r') as f:\n",
" query = f.readline()\n",
" chunk = pd.read_csv(path, sep='\\t')[[\"Publication Years\",\"Record Count\"]]\n",
" chunk[\"name\"] = filename.replace(\".txt\",\"\")\n",
" chunk[\"query\"] = query\n",
" agg_df = pd.concat([chunk,agg_df],ignore_index=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"agg_df[\"region\"] = agg_df[\"query\"].apply(lambda x: \"EU+China\" if \"CU\" in x else \"Global\")\n",
"agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0])\n",
"agg_df[\"kw_token\"] = agg_df[\"kw_token\"].apply(lambda x: \"OR COMPOSITE\" if \" OR \" in x else x)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"agg_df = agg_df[~agg_df[\"Record Count\"].isna()]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": " query Record Count\n0 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 972.0\n1 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 451.0\n2 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 30.0\n3 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 12.0\n4 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 5.0\n.. ... ...\n384 TS=(\"word embedding*\") AND PY=(2011-2022) 7068.0\n385 TS=(\"word vector*\") AND PY=(2011-2022) 1747.0\n386 TS=((\"face recognition\" NOT \"brain\")) AND PY=(... 19690.0\n387 TS=((\"object detection\" NOT \"brain\")) AND PY=(... 28989.0\n388 TS=((\"speech recognition\" NOT \"brain\")) AND PY... 19912.0\n\n[389 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>query</th>\n <th>Record Count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>972.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>451.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>30.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>12.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>5.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>384</th>\n <td>TS=(\"word embedding*\") AND PY=(2011-2022)</td>\n <td>7068.0</td>\n </tr>\n <tr>\n <th>385</th>\n <td>TS=(\"word vector*\") AND PY=(2011-2022)</td>\n <td>1747.0</td>\n </tr>\n <tr>\n <th>386</th>\n <td>TS=((\"face recognition\" NOT \"brain\")) AND PY=(...</td>\n <td>19690.0</td>\n </tr>\n <tr>\n <th>387</th>\n <td>TS=((\"object detection\" NOT \"brain\")) AND PY=(...</td>\n <td>28989.0</td>\n </tr>\n <tr>\n <th>388</th>\n <td>TS=((\"speech recognition\" NOT \"brain\")) AND PY...</td>\n <td>19912.0</td>\n </tr>\n </tbody>\n</table>\n<p>389 rows × 2 columns</p>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agg_df.groupby(\"query\",as_index=False)[\"Record Count\"].sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"# agg_df = agg_df[agg_df[\"Publication Years\"].str.startswith(\"20\", na=False)].copy()\n",
"# agg_df[\"Publication Years\"] = agg_df[\"Publication Years\"].astype(int)\n",
"# agg_df[((agg_df[\"Publication Years\"]>2010) & (agg_df[\"Publication Years\"]<2023))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# agg_df[\"Publication Years\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"agg_df.to_excel(r'C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_processed_data\\query_yearly_agg.xlsx', index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}