{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from pandas.errors import EmptyDataError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "workdir_path = 'wos_downloads/aggregated'\n",
    "year_count_cols = [\"Publication Years\", \"Record Count\"]\n",
    "\n",
    "# Collect one frame per export file and concatenate once at the end;\n",
    "# calling pd.concat inside the loop re-copies every accumulated row each time.\n",
    "chunks = []\n",
    "for root, _dirs, files in os.walk(workdir_path):\n",
    "    analyze_files = [name for name in files if 'analyze_' in name]\n",
    "    if not analyze_files:\n",
    "        # NOTE(review): folders without an 'analyze_*' file are skipped. The\n",
    "        # previous version left an unfinished `elif len(files)==1` branch in\n",
    "        # the loop - confirm whether such folders should be loaded as well.\n",
    "        continue\n",
    "    # The search string is the first line of the folder's query.txt; the line\n",
    "    # terminator is dropped so it does not leak into group keys and exports.\n",
    "    with open(os.path.join(root, 'query.txt'), 'r') as f:\n",
    "        query = f.readline().rstrip('\\n')\n",
    "    for filename in analyze_files:\n",
    "        path = os.path.join(root, filename)\n",
    "        try:\n",
    "            chunk = pd.read_csv(path, sep='\\t')[year_count_cols]\n",
    "        except EmptyDataError:\n",
    "            # Empty per-file export: fall back to the folder's analyze.txt.\n",
    "            path = os.path.join(root, \"analyze.txt\")\n",
    "            chunk = pd.read_csv(path, sep='\\t')[year_count_cols]\n",
    "        chunk[\"name\"] = filename.replace(\".txt\", \"\")\n",
    "        chunk[\"query\"] = query\n",
    "        chunks.append(chunk)\n",
    "\n",
    "if not chunks:\n",
    "    # Fail here with a clear message instead of a KeyError on \"query\" later on.\n",
    "    raise FileNotFoundError(\"no 'analyze_*' exports found under %r\" % workdir_path)\n",
    "\n",
    "# Reversed so rows keep the order of the earlier prepend-style concatenation.\n",
    "agg_df = pd.concat(chunks[::-1], ignore_index=True)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [],
   "source": [
    "def classify_region(query):\n",
    "    \"\"\"Return 'EU+China' when the query text contains 'CU', otherwise 'Global'.\"\"\"\n",
    "    return \"EU+China\" if \"CU\" in query else \"Global\"\n",
    "\n",
    "\n",
    "def extract_kw_token(query):\n",
    "    \"\"\"Return the keyword token of a query string.\n",
    "\n",
    "    The token is the text between the last 'TS=(' and the next ')', with any\n",
    "    surrounding '(' characters stripped. Tokens that contain ' OR ' are\n",
    "    reported as 'COMPOSITE SEARCH'.\n",
    "    \"\"\"\n",
    "    token = query.split(\"TS=(\")[-1].split(\")\")[0].strip(\"(\")\n",
    "    return \"COMPOSITE SEARCH\" if \" OR \" in token else token\n",
    "\n",
    "\n",
    "agg_df = agg_df.assign(\n",
    "    region=agg_df[\"query\"].map(classify_region),\n",
    "    kw_token=agg_df[\"query\"].map(extract_kw_token),\n",
    ")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "# Drop rows that carry no record count.\n",
    "agg_df = agg_df.dropna(subset=[\"Record Count\"])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "data": {
      "text/plain": " query Record Count\n0 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 972.0\n1 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 
451.0\n2 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 30.0\n3 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 12.0\n4 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 5.0\n.. ... ...\n543 TS=((\"face recognition\" NOT \"brain\")) AND PY=(... 19690.0\n544 TS=((\"linear regression\" NOT \"p=\")) AND PY=(20... 91493.0\n545 TS=((\"logistic regression\" NOT \"p=\")) AND PY=(... 171776.0\n546 TS=((\"object detection\" NOT \"brain\")) AND PY=(... 28989.0\n547 TS=((\"speech recognition\" NOT \"brain\")) AND PY... 19912.0\n\n[548 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
queryRecord Count
0CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...972.0
1CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...451.0
2CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...30.0
3CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...12.0
4CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST...5.0
.........
543TS=((\"face recognition\" NOT \"brain\")) AND PY=(...19690.0
544TS=((\"linear regression\" NOT \"p=\")) AND PY=(20...91493.0
545TS=((\"logistic regression\" NOT \"p=\")) AND PY=(...171776.0
546TS=((\"object detection\" NOT \"brain\")) AND PY=(...28989.0
547TS=((\"speech recognition\" NOT \"brain\")) AND PY...19912.0
\n

548 rows × 2 columns

\n
" },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total record count per query string, shown as a two-column frame.\n",
    "agg_df.groupby(by=\"query\", as_index=False)[\"Record Count\"].agg(\"sum\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "outputs": [
    {
     "data": {
      "text/plain": " kw_token Record Count\n0 COMPOSITE SEARCH 62205.0\n1 \"neural network*\" 10999.0\n2 \"machine* learn*\" 5765.0\n3 \"deep learn*\" 5211.0\n4 \"momentum\" 4974.0\n.. ... ...\n243 \"artificial cognition\" 1.0\n244 \"ai in disaster management\" 1.0\n245 \"vector embedding*\" 1.0\n246 \"ai in finance\" 1.0\n247 \"content based filtering\" 1.0\n\n[248 rows x 2 columns]",
      "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
kw_tokenRecord Count
0COMPOSITE SEARCH62205.0
1\"neural network*\"10999.0
2\"machine* learn*\"5765.0
3\"deep learn*\"5211.0
4\"momentum\"4974.0
.........
243\"artificial cognition\"1.0
244\"ai in disaster management\"1.0
245\"vector embedding*\"1.0
246\"ai in finance\"1.0
247\"content based filtering\"1.0
\n

248 rows × 2 columns

\n
" },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rank the keyword tokens of the EU+China queries by their summed record count.\n",
    "kw_ranks = (\n",
    "    agg_df.loc[agg_df[\"region\"] == \"EU+China\"]\n",
    "    .groupby(\"kw_token\", as_index=False)[\"Record Count\"].sum()\n",
    "    .sort_values(by=\"Record Count\", ascending=False)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "kw_ranks"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "outputs": [],
   "source": [
    "kw_ranks.to_excel(\"kw_token_ranked.xlsx\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [],
   "source": [
    "# Disabled exploratory step: keep rows whose year starts with \"20\", cast the\n",
    "# year to int and show the rows for 2011-2022.\n",
    "# agg_df = agg_df[agg_df[\"Publication Years\"].str.startswith(\"20\", na=False)].copy()\n",
    "# agg_df[\"Publication Years\"] = agg_df[\"Publication Years\"].astype(int)\n",
    "# agg_df[((agg_df[\"Publication Years\"]>2010) & (agg_df[\"Publication Years\"]<2023))]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [],
   "source": [
    "# agg_df[\"Publication Years\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [
    "# Written relative to the working directory, like the input path and\n",
    "# kw_token_ranked.xlsx, instead of an absolute path inside one user's profile.\n",
    "# NOTE(review): assumes the notebook is run from the project's WOS folder - confirm.\n",
    "processed_dir = \"wos_processed_data\"\n",
    "os.makedirs(processed_dir, exist_ok=True)\n",
    "agg_df.to_excel(os.path.join(processed_dir, \"query_yearly_agg.xlsx\"), index=False)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}