Added WOS Selenium crawler; slightly updated WOS data processing

2 years ago · da720a6131
parent edf23fbcda
commit da720a6131
3 changed files with 855 additions and 120 deletions
--- a/WOS/wos_extract/wos_query_generator.ipynb
+++ b/WOS/wos_extract/wos_query_generator.ipynb
@ -2,19 +2,21 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
+    "import os\n",
+    "\n",
    "import pandas as pd\n",
    "focal_countries_list = [\"Peoples R china\", \"Hong Kong\"]"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 2,
   "outputs": [],
   "source": [
    "country_mode = \"CU\" #CU-country-region AU-address"
@ -28,7 +30,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 3,
   "outputs": [],
   "source": [
    "# (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"computer vision\") OR TS=(\"pattern recognition\")) AND"
@ -42,13 +44,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 4,
   "outputs": [
    {
     "data": {
      "text/plain": "'TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")'"
     },
-     "execution_count": 53,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -60,7 +62,7 @@
    "\n",
    "keywords = [c.strip() for c in keywords[0].split(\",\")]\n",
    "\n",
-    "keywords_str = ' OR '.join('TS=(\"'+k+'\")' for k in keywords)\n",
+    "keywords_str = ' OR '.join('TS=(\\\"'+k+'\\\")' for k in keywords)\n",
    "keywords_str"
   ],
   "metadata": {
@ -72,17 +74,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 54,
-   "outputs": [
-    {
-     "data": {
-      "text/plain": "'CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND'"
-     },
-     "execution_count": 54,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": 5,
+   "outputs": [],
   "source": [
    "scope_country_source = r'..\\eu_scope_countries.txt'\n",
    "\n",
@ -90,11 +83,23 @@
    "    coop_countries = f.readlines()\n",
    "coop_countries = [c.strip().upper() for c in coop_countries[0].split(\",\")]\n",
    "focal_countries = [c.strip().upper() for c in focal_countries_list]\n",
+    "eu_countries = coop_countries[0:-7]\n",
+    "assoc_countries = coop_countries[-7:]\n",
+    "\n",
+    "nor_c = [coop_countries[-7],]\n",
+    "swi_c = [coop_countries[-6],]\n",
+    "uk_c = coop_countries[-5:]\n",
    "\n",
    "foc_str = ' OR '.join([country_mode+'='+c for c in focal_countries])\n",
    "coop_str = ' OR '.join([country_mode+'='+c for c in coop_countries])\n",
+    "eu_str = ' OR '.join([country_mode+'='+c for c in eu_countries])\n",
+    "assoc_str = ' OR '.join([country_mode+'='+c for c in assoc_countries])\n",
    "\n",
-    "coop_str"
+    "nor_str =' OR '.join([country_mode+'='+c for c in nor_c])\n",
+    "swi_str =' OR '.join([country_mode+'='+c for c in swi_c])\n",
+    "uk_str =' OR '.join([country_mode+'='+c for c in uk_c])\n",
+    "eu_sub_str = eu_str.split(' OR ')\n",
+    "# eu_sub_str"
   ],
   "metadata": {
    "collapsed": false,
@ -105,13 +110,48 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 5,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "['UNITED KINGDOM', 'ENGLAND', 'WALES', 'SCOTLAND', 'N IRELAND']"
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "coop_countries[-5:]"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
   "outputs": [
    {
     "data": {
      "text/plain": "'CU=PEOPLES R CHINA OR CU=HONG KONG'"
     },
-     "execution_count": 55,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -128,19 +168,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 8,
   "outputs": [
    {
     "data": {
-      "text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
+      "text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)'"
     },
-     "execution_count": 58,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "scope_query = f'({foc_str}) AND ({coop_str}) AND ({keywords_str})'\n",
+    "scope_query = f'({foc_str}) AND ({coop_str}) AND ({keywords_str}) AND PY=(2011-2022)'\n",
    "scope_query"
   ],
   "metadata": {
@ -152,19 +192,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 9,
   "outputs": [
    {
     "data": {
-      "text/plain": "'(CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
+      "text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
     },
-     "execution_count": 60,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "ch_scope_query = f'({coop_str}) AND ({keywords_str})'\n",
+    "ch_scope_query = f'({foc_str}) AND ({keywords_str})'\n",
    "ch_scope_query"
   ],
   "metadata": {
@ -173,6 +213,140 @@
     "name": "#%%\n"
    }
   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "'(CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eu_scope_query = f'({eu_str}) AND ({keywords_str})'\n",
+    "eu_scope_query"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "outputs": [],
+   "source": [
+    "sub_queries = [f'PY=(2011-2022) AND ({i_str}) AND ({keywords_str})' for i_str in [foc_str,eu_str,assoc_str,nor_str,swi_str,uk_str]+eu_sub_str]"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "outputs": [],
+   "source": [
+    "from wossel_miners import wos_fetch_entries,wos_fetch_yearly_output"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 33/33 [12:49<00:00, 23.31s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "wos_fetch_yearly_output(query_str_list=sub_queries)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)'"
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "scope_query"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Hoooold...\n",
+      "27672 records found! Here we go in 93 steps...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 92/92 [09:38<00:00,  6.29s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "final batch of 27601-27672\n"
+     ]
+    }
+   ],
+   "source": [
+    "wos_fetch_entries(query_str=scope_query)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
  }
 ],
 "metadata": {
--- a/WOS/wos_extract/wossel_miners.py
+++ b/WOS/wos_extract/wossel_miners.py
@ -0,0 +1,266 @@
+import os
+import glob
+import pytest
+import time
+from datetime import datetime
+import json
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support import expected_conditions
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+# from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.firefox.options import Options
+
+from tqdm import tqdm
+import random
+
+def close_pendo_windows(driver):
+    """Dismiss cookie and Pendo guide pop-ups on the current page, best effort.
+
+    Every known pop-up control is looked up by XPath and clicked. A control
+    that is not present, or that cannot be clicked, is skipped so the caller
+    can carry on with the page as it is.
+
+    Args:
+        driver: Selenium WebDriver whose current page should be cleaned up.
+    """
+    # Imported locally so this function does not rely on edits to the
+    # file-level import block.
+    from selenium.common.exceptions import WebDriverException
+
+    dismiss_xpaths = (
+        # Cookies
+        '//*[@id="onetrust-accept-btn-handler"]',
+        # "Got it"
+        '//button[contains(@class, "_pendo-button-primaryButton")]',
+        # "No thanks"
+        '//button[contains(@class, "_pendo-button-secondaryButton")]',
+        # Pendo guide close control (the closing "]" was missing before, which
+        # made the selector invalid, so this click could never happen)
+        '//span[contains(@class, "_pendo-close-guide")]',
+        # Overlay (same missing "]" fixed here)
+        '//div[contains(@class, "cdk-overlay-container")]',
+    )
+    for xpath in dismiss_xpaths:
+        try:
+            driver.find_element(By.XPATH, xpath).click()
+        except WebDriverException:
+            # Absent or non-clickable pop-up: nothing to dismiss. Unlike a
+            # bare ``except``, this lets KeyboardInterrupt and programming
+            # errors propagate.
+            continue
+
+
+# Root folder used when a caller does not pass ``download_root``.
+_DEFAULT_DOWNLOAD_ROOT = r'C:\Users\radvanyi\PycharmProjects\ZSI_analytics\WOS\wos_extract\wos_downloads'
+# Number of records requested per export.
+_EXPORT_BATCH_SIZE = 300
+# Locator of the export dialog's confirm button; it is clicked to start an
+# export and waited on (until invisible) before and after each export.
+_EXPORT_BUTTON = (By.CSS_SELECTOR, ".mat-flat-button .ng-star-inserted")
+
+
+def _pause(wait_mu, wait_sigma):
+    """Sleep for a Gaussian-distributed time, clamped so it is never negative."""
+    # time.sleep() raises ValueError on negative input, which random.gauss()
+    # can produce for a small mean or a large sigma.
+    time.sleep(max(0.0, random.gauss(wait_mu, wait_sigma)))
+
+
+def _prepare_download_dir(root, subdir, query_str):
+    """Create an empty, timestamped download folder and store the query in it.
+
+    Returns:
+        Tuple ``(download_path, date_time)`` where ``date_time`` is the
+        timestamp string used as the folder name.
+    """
+    date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f") + "save"
+    download_path = os.path.join(root, subdir, date_time)
+    os.makedirs(download_path, exist_ok=True)
+    for leftover in glob.glob(os.path.join(download_path, '*')):
+        os.remove(leftover)
+    with open(os.path.join(download_path, 'query.txt'), "w") as f:
+        f.write(query_str)
+    return download_path, date_time
+
+
+def _wait_for_download(file_path, timeout=60.0, poll=0.25):
+    """Poll until ``file_path`` exists and no ``*.part`` file sits next to it.
+
+    Returns:
+        True when the file showed up within ``timeout`` seconds, else False.
+        The caller's ``os.rename`` still raises FileNotFoundError in the
+        latter case, exactly as it did without the wait.
+    """
+    folder = os.path.dirname(file_path)
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        # NOTE(review): presumably Firefox keeps a "*.part" file while a
+        # download is in progress -- confirm. If it does not, this check is
+        # simply a no-op.
+        partial = glob.glob(os.path.join(folder, '*.part'))
+        if os.path.exists(file_path) and not partial:
+            return True
+        time.sleep(poll)
+    return False
+
+
+def _open_wos_results(query_str, download_path, debug, announce):
+    """Start Firefox, run ``query_str`` in the advanced search, return the driver.
+
+    On return the driver shows the result page, scrolled to the top, with the
+    ``.brand-blue`` result counter visible. If anything fails on the way the
+    browser is shut down before the exception propagates.
+    """
+    from selenium.common.exceptions import WebDriverException
+
+    options = Options()
+    options.set_preference("browser.download.folderList", 2)
+    options.set_preference("browser.download.manager.showWhenStarting", False)
+    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv/xls")
+    options.set_preference("browser.download.dir", download_path)
+    if not debug:
+        options.add_argument('--headless')
+
+    driver = webdriver.Firefox(options=options)
+    try:
+        driver.get("https://www.webofscience.com/")
+        driver.set_window_size(974, 1040)
+        try:
+            WebDriverWait(driver, 30).until(
+                expected_conditions.visibility_of_element_located((By.ID, "onetrust-reject-all-handler")))
+            driver.find_element(By.ID, "onetrust-reject-all-handler").click()
+        except WebDriverException:
+            # No cookie banner (or it could not be clicked): fall back to the
+            # generic pop-up cleanup.
+            close_pendo_windows(driver)
+        WebDriverWait(driver, 30).until(
+            expected_conditions.visibility_of_element_located((By.LINK_TEXT, "Advanced Search")))
+        # NOTE(review): this value looks like two class names rather than an
+        # element id, so the wait probably matches nothing and returns at
+        # once -- confirm against the page markup before changing it.
+        WebDriverWait(driver, 30).until(
+            expected_conditions.invisibility_of_element_located((By.ID, "onetrust-pc-dark-filter ot-fade-in")))
+
+        if announce:
+            print("Hoooold...")
+        time.sleep(2)
+        WebDriverWait(driver, 30).until(expected_conditions.element_to_be_clickable((By.LINK_TEXT, "Advanced Search")))
+        driver.find_element(By.LINK_TEXT, "Advanced Search").click()
+
+        WebDriverWait(driver, 30).until(expected_conditions.element_to_be_clickable((By.ID, "advancedSearchInputArea")))
+        driver.find_element(By.ID, "advancedSearchInputArea").click()
+        driver.find_element(By.ID, "advancedSearchInputArea").send_keys(query_str)
+        driver.find_element(By.CSS_SELECTOR, ".mat-menu-trigger > svg").click()
+        driver.find_element(By.CSS_SELECTOR, ".cdk-focused > span").click()
+
+        WebDriverWait(driver, 30).until(
+            expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, ".brand-blue")))
+        driver.execute_script("window.scrollTo(0,0)")
+    except BaseException:
+        # Do not leave an orphaned browser behind when the setup fails.
+        driver.quit()
+        raise
+    return driver
+
+
+def _export_record_batch(driver, first, last, download_path, wait_mu, wait_sigma):
+    """Export records ``first``..``last`` and rename the downloaded file.
+
+    The download is expected as ``savedrecs.txt`` inside ``download_path`` and
+    is renamed to ``records_<first>_<last>.txt``.
+    """
+    if first != 1:
+        # Later batches wait until the export button of the previous batch is
+        # no longer visible before opening the export menu again.
+        WebDriverWait(driver, 30).until(expected_conditions.invisibility_of_element_located(_EXPORT_BUTTON))
+
+    driver.find_element(By.XPATH, "//app-export-menu/div/button").click()
+    # driver.find_element(By.ID, "exportToExcelButton").click()
+    driver.find_element(By.ID, "exportToTabWinButton").click()
+    if first == 1:
+        # The first batch leaves the "from" field untouched and only sets the
+        # upper bound.
+        driver.find_element(By.CSS_SELECTOR, "#radio3 .mat-radio-outer-circle").click()
+    else:
+        driver.find_element(By.CSS_SELECTOR, "#radio3 .mat-radio-container").click()
+        driver.find_element(By.NAME, "markFrom").clear()
+        driver.find_element(By.NAME, "markFrom").send_keys(f"{first}")
+    driver.find_element(By.NAME, "markTo").clear()
+    driver.find_element(By.NAME, "markTo").send_keys(f"{last}")
+    driver.find_element(By.CSS_SELECTOR, ".margin-top-5 > .dropdown").click()
+    driver.find_element(By.XPATH, "//span[contains(.,\'Full Record\')]").click()
+    driver.find_element(*_EXPORT_BUTTON).click()
+
+    WebDriverWait(driver, 30).until(expected_conditions.invisibility_of_element_located(_EXPORT_BUTTON))
+    _pause(wait_mu, wait_sigma)
+
+    old_name = os.path.join(download_path, "savedrecs.txt")
+    new_name = os.path.join(download_path, f"records_{first}_{last}.txt")
+    # A fixed sleep alone can lose the race against a slow download.
+    _wait_for_download(old_name)
+    os.rename(old_name, new_name)
+
+
+def wos_fetch_entries(query_str="TS=\"web of science\" AND PY=(2008-2010)",
+                      wait_mu=1, wait_sigma=0.2, debug=False, download_root=None):
+    """Download all records matching ``query_str`` in batches of 300.
+
+    A fresh ``entry_batches/<timestamp>save`` folder is created below
+    ``download_root``. The query is stored there as ``query.txt`` and every
+    exported batch as ``records_<first>_<last>.txt``.
+
+    Args:
+        query_str: Query typed into the advanced search input.
+        wait_mu: Mean of the random pause (seconds) after each export.
+        wait_sigma: Standard deviation of that pause.
+        debug: When False the browser is started with ``--headless``.
+        download_root: Folder receiving the downloads; ``None`` keeps the
+            previously hard-coded location.
+    """
+    root = _DEFAULT_DOWNLOAD_ROOT if download_root is None else download_root
+    download_path, _ = _prepare_download_dir(root, "entry_batches", query_str)
+
+    driver = _open_wos_results(query_str, download_path, debug, announce=True)
+    try:
+        count_str = driver.find_element(By.CSS_SELECTOR, ".brand-blue").text
+        count_int = int(count_str.replace(",", "").replace(".", "").strip())
+
+        # Inclusive (first, last) record ranges. Building them up front also
+        # covers result sets of 301 records or fewer, which used to fail with
+        # a NameError because the loop variable was never bound.
+        batches = [(first, min(first + _EXPORT_BATCH_SIZE - 1, count_int))
+                   for first in range(1, count_int + 1, _EXPORT_BATCH_SIZE)]
+        print(f'{count_int} records found! Here we go in {len(batches)} steps...')
+
+        for first, last in tqdm(batches[:-1], position=0, leave=True):
+            _export_record_batch(driver, first, last, download_path, wait_mu, wait_sigma)
+
+        if batches:
+            first, last = batches[-1]
+            print(f'final batch of {first}-{last}')
+            _export_record_batch(driver, first, last, download_path, wait_mu, wait_sigma)
+
+        time.sleep(2)
+        _pause(wait_mu, wait_sigma)
+    finally:
+        # Shut the browser down on failure as well as on success.
+        driver.quit()
+
+
+def wos_fetch_yearly_output(query_str_list = (
+        "TS=\"web of science\" AND PY=(2008-2010)",
+        "TS=\"artificial intelligence\" AND PY=(2011-2022)"),
+        wait_mu=1, wait_sigma=0.2,debug=False, download_root=None):
+    """Download the "Publication Years" analysis table for every query.
+
+    Each query gets its own browser session and its own
+    ``aggregated/<timestamp>save`` folder below ``download_root``. The folder
+    holds ``query.txt`` and the table, renamed from ``analyze.txt`` to
+    ``analyze_PY_<timestamp>save_.txt``.
+
+    Args:
+        query_str_list: Iterable of query strings; a single string is treated
+            as one query.
+        wait_mu: Mean of the random pause (seconds) after each download.
+        wait_sigma: Standard deviation of that pause.
+        debug: When False the browser is started with ``--headless``.
+        download_root: Folder receiving the downloads; ``None`` keeps the
+            previously hard-coded location.
+    """
+    if isinstance(query_str_list, str):
+        # A bare string would otherwise be iterated character by character.
+        query_str_list = (query_str_list,)
+    root = _DEFAULT_DOWNLOAD_ROOT if download_root is None else download_root
+
+    for query_str in tqdm(query_str_list):
+        download_path, date_time = _prepare_download_dir(root, "aggregated", query_str)
+
+        driver = _open_wos_results(query_str, download_path, debug, announce=False)
+        try:
+            driver.find_element(By.XPATH, "//span[contains(.,\'Analyze Results\')]").click()
+            driver.find_element(By.CSS_SELECTOR, "#snSelectCategories svg").click()
+            driver.find_element(By.XPATH, "//span[contains(.,\'Publication Years\')]").click()
+            # NOTE(review): "mat-radio-3" is an auto-generated looking id;
+            # which option it selects cannot be told from this file.
+            driver.find_element(By.XPATH, "//mat-radio-button[@id=\'mat-radio-3\']/label/span/span").click()
+            driver.find_element(By.XPATH, "//span[contains(.,\'Download data table\')]").click()
+
+            old_name = os.path.join(download_path, "analyze.txt")
+            new_name = os.path.join(download_path, f"analyze_PY_{date_time}_.txt")
+
+            time.sleep(2)
+            # The fixed sleep above is kept; the poll covers slower downloads.
+            _wait_for_download(old_name)
+            os.rename(old_name, new_name)
+            _pause(wait_mu, wait_sigma)
+        finally:
+            # One browser per query: always shut it down so a failing query
+            # does not leave a Firefox process behind.
+            driver.quit()
+
+if __name__ == '__main__':
+    # Script entry point: run both crawlers in turn with their default
+    # queries, each called with debug=False.
+    for crawl in (wos_fetch_entries, wos_fetch_yearly_output):
+        crawl(debug=False)
--- a/WOS/wos_processing.ipynb
+++ b/WOS/wos_processing.ipynb