You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_extract/wos_search_kw_analysis.ipynb

191 lines
9.4 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"from pandas.errors import EmptyDataError"
]
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [],
"source": [
"agg_df = pd.DataFrame()\n",
"\n",
"workdir_path = 'wos_downloads/aggregated'\n",
"for root, dirs, files in os.walk(workdir_path):\n",
" for filename in files:\n",
" if 'analyze_' in filename:\n",
" path=os.path.join(root, filename)\n",
" with open(os.path.join(root, 'query.txt'),'r') as f:\n",
" query = f.readline()\n",
" try:\n",
" chunk = pd.read_csv(path, sep='\\t')[[\"Publication Years\",\"Record Count\"]]\n",
" except EmptyDataError:\n",
" path=os.path.join(root, \"analyze.txt\")\n",
" chunk = pd.read_csv(path, sep='\\t')[[\"Publication Years\",\"Record Count\"]]\n",
" chunk[\"name\"] = filename.replace(\".txt\",\"\")\n",
" chunk[\"query\"] = query\n",
" agg_df = pd.concat([chunk,agg_df],ignore_index=True)\n",
" # elif len(files)==1:\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [],
"source": [
"agg_df[\"region\"] = agg_df[\"query\"].apply(lambda x: \"EU+China\" if \"CU\" in x else \"Global\")\n",
"agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0].strip(\"(\"))\n",
"agg_df[\"kw_token\"] = agg_df[\"kw_token\"].apply(lambda x: \"COMPOSITE SEARCH\" if \" OR \" in x else x)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [],
"source": [
"agg_df = agg_df[~agg_df[\"Record Count\"].isna()]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": " query Record Count\n0 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 972.0\n1 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 451.0\n2 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 30.0\n3 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 12.0\n4 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 5.0\n.. ... ...\n543 TS=((\"face recognition\" NOT \"brain\")) AND PY=(... 19690.0\n544 TS=((\"linear regression\" NOT \"p=\")) AND PY=(20... 91493.0\n545 TS=((\"logistic regression\" NOT \"p=\")) AND PY=(... 171776.0\n546 TS=((\"object detection\" NOT \"brain\")) AND PY=(... 28989.0\n547 TS=((\"speech recognition\" NOT \"brain\")) AND PY... 19912.0\n\n[548 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>query</th>\n <th>Record Count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>972.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>451.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>30.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>12.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...</td>\n <td>5.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>543</th>\n <td>TS=((\"face recognition\" NOT \"brain\")) AND PY=(...</td>\n <td>19690.0</td>\n </tr>\n <tr>\n <th>544</th>\n <td>TS=((\"linear regression\" NOT \"p=\")) AND PY=(20...</td>\n <td>91493.0</td>\n </tr>\n <tr>\n <th>545</th>\n <td>TS=((\"logistic regression\" NOT \"p=\")) AND PY=(...</td>\n <td>171776.0</td>\n </tr>\n <tr>\n <th>546</th>\n <td>TS=((\"object detection\" NOT \"brain\")) AND PY=(...</td>\n <td>28989.0</td>\n </tr>\n <tr>\n <th>547</th>\n <td>TS=((\"speech recognition\" NOT \"brain\")) AND PY...</td>\n <td>19912.0</td>\n </tr>\n </tbody>\n</table>\n<p>548 rows × 2 columns</p>\n</div>"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agg_df.groupby(\"query\",as_index=False)[\"Record Count\"].sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"data": {
"text/plain": " kw_token Record Count\n0 COMPOSITE SEARCH 62205.0\n1 \"neural network*\" 10999.0\n2 \"machine* learn*\" 5765.0\n3 \"deep learn*\" 5211.0\n4 \"momentum\" 4974.0\n.. ... ...\n243 \"artificial cognition\" 1.0\n244 \"ai in disaster management\" 1.0\n245 \"vector embedding*\" 1.0\n246 \"ai in finance\" 1.0\n247 \"content based filtering\" 1.0\n\n[248 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>kw_token</th>\n <th>Record Count</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>COMPOSITE SEARCH</td>\n <td>62205.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>\"neural network*\"</td>\n <td>10999.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>\"machine* learn*\"</td>\n <td>5765.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>\"deep learn*\"</td>\n <td>5211.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>\"momentum\"</td>\n <td>4974.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>243</th>\n <td>\"artificial cognition\"</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>244</th>\n <td>\"ai in disaster management\"</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>245</th>\n <td>\"vector embedding*\"</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>246</th>\n <td>\"ai in finance\"</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>247</th>\n <td>\"content based filtering\"</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>248 rows × 2 columns</p>\n</div>"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_ranks = agg_df[agg_df[\"region\"]==\"EU+China\"].groupby(\"kw_token\",as_index=False)[\"Record Count\"].sum().sort_values(by=\"Record Count\", ascending=False).reset_index().drop(columns=\"index\")\n",
"kw_ranks"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 30,
"outputs": [],
"source": [
"kw_ranks.to_excel(\"kw_token_ranked.xlsx\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"# agg_df = agg_df[agg_df[\"Publication Years\"].str.startswith(\"20\", na=False)].copy()\n",
"# agg_df[\"Publication Years\"] = agg_df[\"Publication Years\"].astype(int)\n",
"# agg_df[((agg_df[\"Publication Years\"]>2010) & (agg_df[\"Publication Years\"]<2023))]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [],
"source": [
"# agg_df[\"Publication Years\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [],
"source": [
"agg_df.to_excel(r'C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_processed_data\\query_yearly_agg.xlsx', index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}