{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "from pandas.errors import EmptyDataError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "# Gather every 'analyze_*' export found under workdir_path into one frame.\n",
    "workdir_path = 'wos_downloads/aggregated'\n",
    "value_columns = [\"Publication Years\", \"Record Count\"]\n",
    "\n",
    "chunks = []\n",
    "for root, dirs, files in os.walk(workdir_path):\n",
    "    for filename in files:\n",
    "        if 'analyze_' not in filename:\n",
    "            continue\n",
    "        # The first line of the sibling query.txt is stored alongside the counts.\n",
    "        with open(os.path.join(root, 'query.txt'), 'r') as f:\n",
    "            query = f.readline()\n",
    "        path = os.path.join(root, filename)\n",
    "        try:\n",
    "            chunk = pd.read_csv(path, sep='\\t')[value_columns]\n",
    "        except EmptyDataError:\n",
    "            # This export has nothing to parse: read analyze.txt from the same directory instead.\n",
    "            path = os.path.join(root, \"analyze.txt\")\n",
    "            chunk = pd.read_csv(path, sep='\\t')[value_columns]\n",
    "        chunks.append(chunk.assign(name=filename.replace(\".txt\", \"\"), query=query))\n",
    "\n",
    "# Concatenate once instead of once per file (avoids quadratic copying).\n",
    "# The list is reversed so that files visited later come first in agg_df.\n",
    "agg_df = pd.concat(chunks[::-1], ignore_index=True) if chunks else pd.DataFrame()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [],
   "source": [
    "def extract_kw_token(query):\n",
    "    \"\"\"Return the text between the query's last 'TS=(' and the next ')'.\n",
    "\n",
    "    Leading/trailing '(' are stripped from that text. If 'TS=(' is absent the\n",
    "    whole query is used as the starting text. A result containing ' OR ' is\n",
    "    reported as 'COMPOSITE SEARCH'.\n",
    "    \"\"\"\n",
    "    token = query.split(\"TS=(\")[-1].split(\")\")[0].strip(\"(\")\n",
    "    return \"COMPOSITE SEARCH\" if \" OR \" in token else token\n",
    "\n",
    "\n",
    "# Queries containing \"CU\" are labelled \"EU+China\"; all others \"Global\".\n",
    "agg_df[\"region\"] = agg_df[\"query\"].apply(lambda x: \"EU+China\" if \"CU\" in x else \"Global\")\n",
    "agg_df[\"kw_token\"] = agg_df[\"query\"].apply(extract_kw_token)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "# Keep only the rows that have a record count.\n",
    "agg_df = agg_df.dropna(subset=[\"Record Count\"])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "data": {
      "text/plain": " query Record Count\n0 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 972.0\n1 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 
451.0\n2 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 30.0\n3 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 12.0\n4 CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... 5.0\n.. ... ...\n543 TS=((\"face recognition\" NOT \"brain\")) AND PY=(... 19690.0\n544 TS=((\"linear regression\" NOT \"p=\")) AND PY=(20... 91493.0\n545 TS=((\"logistic regression\" NOT \"p=\")) AND PY=(... 171776.0\n546 TS=((\"object detection\" NOT \"brain\")) AND PY=(... 28989.0\n547 TS=((\"speech recognition\" NOT \"brain\")) AND PY... 19912.0\n\n[548 rows x 2 columns]", "text/html": "
\n | query | \nRecord Count | \n
---|---|---|
0 | \nCU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... | \n972.0 | \n
1 | \nCU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... | \n451.0 | \n
2 | \nCU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... | \n30.0 | \n
3 | \nCU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... | \n12.0 | \n
4 | \nCU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUST... | \n5.0 | \n
... | \n... | \n... | \n
543 | \nTS=((\"face recognition\" NOT \"brain\")) AND PY=(... | \n19690.0 | \n
544 | \nTS=((\"linear regression\" NOT \"p=\")) AND PY=(20... | \n91493.0 | \n
545 | \nTS=((\"logistic regression\" NOT \"p=\")) AND PY=(... | \n171776.0 | \n
546 | \nTS=((\"object detection\" NOT \"brain\")) AND PY=(... | \n28989.0 | \n
547 | \nTS=((\"speech recognition\" NOT \"brain\")) AND PY... | \n19912.0 | \n
548 rows × 2 columns
\n\n | kw_token | \nRecord Count | \n
---|---|---|
0 | \nCOMPOSITE SEARCH | \n62205.0 | \n
1 | \n\"neural network*\" | \n10999.0 | \n
2 | \n\"machine* learn*\" | \n5765.0 | \n
3 | \n\"deep learn*\" | \n5211.0 | \n
4 | \n\"momentum\" | \n4974.0 | \n
... | \n... | \n... | \n
243 | \n\"artificial cognition\" | \n1.0 | \n
244 | \n\"ai in disaster management\" | \n1.0 | \n
245 | \n\"vector embedding*\" | \n1.0 | \n
246 | \n\"ai in finance\" | \n1.0 | \n
247 | \n\"content based filtering\" | \n1.0 | \n
248 rows × 2 columns
\n