wos_processing_pipeline.ipynb: minor update — addresses are now properly exploded; updated query keywords and added a search-result analysis demo

2 years ago · 904710e47d
parent c1e72fb904
commit 904710e47d
10 changed files with 1824 additions and 192 deletions
--- a/PBI/ZSI.pbix
+++ b/PBI/ZSI.pbix
--- a/WOS/ai_scope_keywords.txt
+++ b/WOS/ai_scope_keywords.txt
@ -9,7 +9,7 @@ image classification,
 reinforcement learning,
 support vector machine*,
 recommender system*,
-random forest,
+random forest*,
 ensemble model*,
 image processing,
 generative network*,
@ -29,7 +29,7 @@ convolutional network*,
 convolutional neural,
 adversarial network*,
 adversarial neural,
-adversarial machine,
+adversarial machine*,
 autoencoder*,
 gated recurrent unit*,
 perceptron*,
@ -42,7 +42,7 @@ gradient descent,
 k-nearest neighbor*,
 naive bayes,
 transfer learning,
-fuzzy logic,
+fuzzy logic*,
 backpropagation,
 computational modeling,
 computational statistic*,
@ -79,8 +79,8 @@ deep belief network*,
 quantum machine learning,
 artificial immune system*,
 swarm robotics,
-autonomous agents,
-machine ethics,
+autonomous agent*,
+machine ethic*,
 collaborative filtering,
 content based filtering,
 pervasive computing,
@ -142,9 +142,31 @@ KNN,
 singular value decomposition,
 regularization,
 turing test,
-turing-test,
 computational learning theory,
 backward chaining,
 forward chaining,
 entity annotation,
-entity extraction
+entity extraction,
+scalable computing,
+expectation maximization algorithm*,
+markov chain,
+markov process,
+markov decision process,
+monte carlo method,
+bayesian inference,
+kernel method,
+eigendecomposition,
+eigen decomposition,
+kernel method,
+radial basis function,
+QR decomposition,
+LU decomposition,
+Cholesky decomposition,
+spectral theorem,
+model selection,
+lagrange multiplier,
+convex optimization,
+nonlinear optimization,
+L? regulari*,
+ridge regression,
+gaussian process
--- a/WOS/wos_extract/geckodriver.log
+++ b/WOS/wos_extract/geckodriver.log
--- a/WOS/wos_extract/wos_downloads/aggregated/2023-04-12-10-40-14-335447save/analyze_scope.txt
+++ b/WOS/wos_extract/wos_downloads/aggregated/2023-04-12-10-40-14-335447save/analyze_scope.txt
@ -0,0 +1,13 @@
+Publication Years	Record Count	% of 45 355
+2022	9081	20.022
+2021	8630	19.028
+2020	6800	14.993
+2019	5502	12.131
+2018	4087	9.011
+2017	2816	6.209
+2016	2338	5.155
+2015	1818	4.008
+2014	1571	3.464
+2013	1135	2.502
+2012	863	1.903
+2011	714	1.574
--- a/WOS/wos_extract/wos_query_generator_simplesyntax.ipynb
+++ b/WOS/wos_extract/wos_query_generator_simplesyntax.ipynb
--- a/WOS/wos_extract/wos_search_kw_analysis.ipynb
+++ b/WOS/wos_extract/wos_search_kw_analysis.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 72,
   "metadata": {
    "collapsed": true
   },
@ -16,7 +16,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 73,
   "outputs": [],
   "source": [
    "agg_df = pd.DataFrame()\n",
@ -39,11 +39,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 74,
   "outputs": [],
   "source": [
    "agg_df[\"region\"] = agg_df[\"query\"].apply(lambda x: \"EU+China\" if \"CU\" in x else \"Global\")\n",
-    "agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0])"
+    "agg_df[\"kw_token\"] = agg_df[\"query\"].apply(lambda x: x.split(\"TS=(\")[-1].split(\")\")[0])\n",
+    "agg_df[\"kw_token\"] = agg_df[\"kw_token\"].apply(lambda x: \"OR COMPOSITE\" if \" OR \" in x else x)"
   ],
   "metadata": {
    "collapsed": false
@ -51,9 +52,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 83,
   "outputs": [],
-   "source": [],
+   "source": [
+    "agg_df = agg_df[~agg_df[\"Record Count\"].isna()]"
+   ],
   "metadata": {
    "collapsed": false
   }
@ -94,13 +97,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 84,
   "outputs": [
    {
     "data": {
-      "text/plain": "Publication Years\n2022                                                                    268\n2021                                                                    260\n2019                                                                    258\n2020                                                                    258\n2018                                                                    250\n2017                                                                    243\n2016                                                                    237\n2015                                                                    227\n2014                                                                    215\n2013                                                                    208\n2012                                                                    193\n2011                                                                    184\n2023                                                                     44\n2014                                                                      4\n2019                                                                      4\n2017                                                                      4\n2018                                                                      4\n2020                                                                      4\n2022                                                                      4\n2021                                                                      4\n2016                                                                      3\n2015                                                                      3\n2013                                                                      3\n2012                                                                      3\n2011                                                                      3\n2023                              
                                        2\nShowing 25 out of 29 entries                                              1\nShowing 25 out of 205 entries                                             1\n8 record(s) (0.025%) do not contain data in the field being analyzed      1\nShowing 25 out of 85 entries                                              1\nShowing 25 out of 189 entries                                             1\n1 record(s) (0.011%) do not contain data in the field being analyzed      1\nName: count, dtype: int64"
+      "text/plain": "Publication Years\n2022    314\n2019    305\n2021    305\n2020    302\n2018    296\n2017    287\n2016    281\n2015    271\n2014    258\n2013    251\n2012    233\n2011    224\n2023     52\n2017      4\n2014      4\n2019      4\n2021      4\n2018      4\n2020      4\n2022      4\n2016      3\n2015      3\n2013      3\n2012      3\n2011      3\n2023      2\nName: count, dtype: int64"
     },
-     "execution_count": 64,
+     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -123,7 +126,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 85,
   "outputs": [],
   "source": [
    "agg_df.to_excel(r'C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_processed_data\\query_yearly_agg.xlsx', index=False)"
--- a/WOS/wos_processed_data/query_yearly_agg.xlsx
+++ b/WOS/wos_processed_data/query_yearly_agg.xlsx
--- a/WOS/wos_processed_data/wos_author_locations.xlsx
+++ b/WOS/wos_processed_data/wos_author_locations.xlsx
--- a/WOS/wos_processed_data/wos_processed.xlsx
+++ b/WOS/wos_processed_data/wos_processed.xlsx
--- a/WOS/wos_processing_pipeline.ipynb
+++ b/WOS/wos_processing_pipeline.ipynb