Added WoS Selenium crawler; slightly updated WoS data processing
parent
edf23fbcda
commit
da720a6131
@ -0,0 +1,266 @@
|
|||||||
|
import os
|
||||||
|
import glob
|
||||||
|
import pytest
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
import json
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
from selenium.webdriver.support import expected_conditions
|
||||||
|
from selenium.webdriver.support.wait import WebDriverWait
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||||
|
# from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
import random
|
||||||
|
|
||||||
|
def close_pendo_windows(driver):
    """Dismiss cookie banners, Pendo guide pop-ups and overlays (best effort).

    Every known pop-up is looked up and clicked independently. A pop-up that
    is not present, or that cannot be clicked, is skipped so that the
    remaining ones are still attempted; no Selenium error escapes this
    function.

    Args:
        driver: Selenium WebDriver whose current page may show the pop-ups.
    """
    # Imported locally: the file's import block does not bring the Selenium
    # exception types into scope.
    from selenium.common.exceptions import WebDriverException

    dismiss_xpaths = (
        # Cookies
        '//*[@id="onetrust-accept-btn-handler"]',
        # "Got it"
        '//button[contains(@class, "_pendo-button-primaryButton")]',
        # "No thanks"
        '//button[contains(@class, "_pendo-button-secondaryButton")]',
        # Guide close control. The closing "]" was missing before, which made
        # the expression an invalid selector that could never match.
        '//span[contains(@class, "_pendo-close-guide")]',
        # Overlay (same missing "]" fixed here).
        '//div[contains(@class, "cdk-overlay-container")]',
    )
    for xpath in dismiss_xpaths:
        try:
            driver.find_element(By.XPATH, xpath).click()
        except WebDriverException:
            # Absent or non-clickable pop-up: nothing to dismiss, move on.
            continue
|
||||||
|
|
||||||
|
|
||||||
|
# Number of records requested per export file.
_WOS_EXPORT_BATCH_SIZE = 300

# Download root used when the caller does not pass ``download_root``.
_WOS_ENTRY_DOWNLOAD_ROOT = (
    r'C:\Users\radvanyi\PycharmProjects\ZSI_analytics\WOS\wos_extract'
    r'\wos_downloads\entry_batches'
)


def _polite_pause(wait_mu, wait_sigma):
    """Sleep for a Gaussian-distributed number of seconds, never a negative one."""
    # random.gauss may return a negative sample, which time.sleep rejects
    # with ValueError.
    time.sleep(max(0.0, random.gauss(wait_mu, wait_sigma)))


def _await_finished_download(file_path, timeout=60.0, poll=0.25):
    """Block until *file_path* exists and its directory holds no ``*.part`` file.

    Raises:
        TimeoutError: if the download has not finished after *timeout* seconds.
    """
    # Firefox keeps in-progress downloads in "*.part" files next to the
    # target; the download directory is dedicated to this run, so any such
    # file belongs to the current download.
    part_pattern = os.path.join(os.path.dirname(file_path), '*.part')
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if os.path.exists(file_path) and not glob.glob(part_pattern):
            return
        time.sleep(poll)
    raise TimeoutError(f'download did not finish within {timeout} s: {file_path}')


def _export_record_range(driver, first, last):
    """Use the export dialog to download records *first*..*last* as "Full Record"."""
    driver.find_element(By.XPATH, "//app-export-menu/div/button").click()
    driver.find_element(By.ID, "exportToTabWinButton").click()
    # Third radio option: export an explicit "from .. to" record range.
    driver.find_element(By.CSS_SELECTOR, "#radio3 .mat-radio-container").click()
    for field_name, value in (("markFrom", first), ("markTo", last)):
        # The element is looked up again for each action, as the original did.
        driver.find_element(By.NAME, field_name).clear()
        driver.find_element(By.NAME, field_name).send_keys(f"{value}")
    driver.find_element(By.CSS_SELECTOR, ".margin-top-5 > .dropdown").click()
    driver.find_element(By.XPATH, "//span[contains(.,\'Full Record\')]").click()
    driver.find_element(By.CSS_SELECTOR, ".mat-flat-button .ng-star-inserted").click()
    # The dialog button disappearing signals that the export was accepted.
    WebDriverWait(driver, 30).until(expected_conditions.invisibility_of_element_located(
        (By.CSS_SELECTOR, ".mat-flat-button .ng-star-inserted")))


def wos_fetch_entries(query_str="TS=\"web of science\" AND PY=(2008-2010)",
                      wait_mu=1, wait_sigma=0.2, debug=False,
                      download_root=None):
    """Run an advanced search on Web of Science and download all hits in batches.

    A fresh, timestamped directory is created below *download_root*. The query
    is stored there as ``query.txt`` and each exported batch is saved as
    ``records_<first>_<last>.txt``.

    Args:
        query_str: Advanced-search query typed into the search box.
        wait_mu: Mean of the random pause (seconds) between exports.
        wait_sigma: Standard deviation of that pause.
        debug: When falsy, Firefox is started with ``--headless``.
        download_root: Directory that receives the timestamped run folder.
            ``None`` keeps the previously hard-coded location.

    Raises:
        TimeoutError: if an exported file does not appear in time.
        selenium.common.exceptions.WebDriverException: on any unhandled
            browser-automation failure (the browser is still shut down).
    """
    # Imported locally: the file's import block does not bring the Selenium
    # exception types into scope.
    from selenium.common.exceptions import WebDriverException

    date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f") + "save"

    # init directory
    if download_root is None:
        download_root = _WOS_ENTRY_DOWNLOAD_ROOT
    download_path = os.path.join(download_root, date_time)
    os.makedirs(download_path, exist_ok=True)
    for leftover in glob.glob(os.path.join(download_path, '*')):
        os.remove(leftover)

    options = Options()
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.manager.showWhenStarting", False)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv/xls")
    options.set_preference("browser.download.dir", download_path)

    with open(os.path.join(download_path, 'query.txt'), "w") as f:
        f.write(query_str)

    if not debug:
        options.add_argument('--headless')

    driver = webdriver.Firefox(options=options)
    try:
        driver.get("https://www.webofscience.com/")
        driver.set_window_size(974, 1040)
        try:
            WebDriverWait(driver, 30).until(
                expected_conditions.visibility_of_element_located((By.ID, "onetrust-reject-all-handler")))
            driver.find_element(By.ID, "onetrust-reject-all-handler").click()
        except WebDriverException:
            # No (clickable) "reject all" cookie button: fall back to
            # dismissing whatever pop-ups are present.
            close_pendo_windows(driver)
        WebDriverWait(driver, 30).until(
            expected_conditions.visibility_of_element_located((By.LINK_TEXT, "Advanced Search")))
        # NOTE(review): an id containing a space looks like a class list; if
        # no element carries this id the wait returns immediately. Confirm
        # the intended locator before changing it.
        WebDriverWait(driver, 30).until(
            expected_conditions.invisibility_of_element_located((By.ID, "onetrust-pc-dark-filter ot-fade-in")))

        print("Hoooold...")
        time.sleep(2)
        WebDriverWait(driver, 30).until(expected_conditions.element_to_be_clickable((By.LINK_TEXT, "Advanced Search")))
        driver.find_element(By.LINK_TEXT, "Advanced Search").click()

        WebDriverWait(driver, 30).until(expected_conditions.element_to_be_clickable((By.ID, "advancedSearchInputArea")))
        driver.find_element(By.ID, "advancedSearchInputArea").click()
        driver.find_element(By.ID, "advancedSearchInputArea").send_keys(query_str)
        driver.find_element(By.CSS_SELECTOR, ".mat-menu-trigger > svg").click()
        driver.find_element(By.CSS_SELECTOR, ".cdk-focused > span").click()

        WebDriverWait(driver, 30).until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, ".brand-blue")))
        driver.execute_script("window.scrollTo(0,0)")
        count_str = driver.find_element(By.CSS_SELECTOR, ".brand-blue").text
        # Strip thousands separators before parsing the hit count.
        count_int = int(count_str.replace(",", "").replace(".", "").strip())

        # One batch per _WOS_EXPORT_BATCH_SIZE records. The last batch is
        # clamped to the hit count, so small result sets (<= one batch) work
        # too instead of failing on an unbound loop variable.
        batch_starts = range(1, count_int + 1, _WOS_EXPORT_BATCH_SIZE)
        print(f'{count_int} records found! Here we go in {len(batch_starts)} steps...')

        saved_file = os.path.join(download_path, 'savedrecs.txt')
        for first in tqdm(batch_starts, position=0, leave=True):
            last = min(first + _WOS_EXPORT_BATCH_SIZE - 1, count_int)
            _export_record_range(driver, first, last)
            _polite_pause(wait_mu, wait_sigma)

            # Rename only once the browser has finished writing the file, so
            # the next export can reuse the default "savedrecs.txt" name.
            _await_finished_download(saved_file)
            os.rename(saved_file,
                      os.path.join(download_path, f'records_{first}_{last}.txt'))

        time.sleep(2)
        _polite_pause(wait_mu, wait_sigma)
    finally:
        # quit() ends the whole browser session even when a step above failed.
        driver.quit()
|
||||||
|
|
||||||
|
# Download root used when the caller does not pass ``download_root``.
_WOS_AGGREGATED_DOWNLOAD_ROOT = (
    r'C:\Users\radvanyi\PycharmProjects\ZSI_analytics\WOS\wos_extract'
    r'\wos_downloads\aggregated'
)


def _await_analyze_download(file_path, timeout=60.0, poll=0.25):
    """Block until *file_path* exists and its directory holds no ``*.part`` file.

    Raises:
        TimeoutError: if the download has not finished after *timeout* seconds.
    """
    # Firefox keeps in-progress downloads in "*.part" files next to the
    # target; the directory is dedicated to one query, so any such file
    # belongs to the current download.
    part_pattern = os.path.join(os.path.dirname(file_path), '*.part')
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if os.path.exists(file_path) and not glob.glob(part_pattern):
            return
        time.sleep(poll)
    raise TimeoutError(f'download did not finish within {timeout} s: {file_path}')


def _fetch_yearly_output_for_query(query_str, wait_mu, wait_sigma, debug, download_root):
    """Download the "Publication Years" analysis table for a single query."""
    # Imported locally: the file's import block does not bring the Selenium
    # exception types into scope.
    from selenium.common.exceptions import WebDriverException

    date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f") + "save"

    # init directory
    download_path = os.path.join(download_root, date_time)
    os.makedirs(download_path, exist_ok=True)
    for leftover in glob.glob(os.path.join(download_path, '*')):
        os.remove(leftover)

    options = Options()
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.manager.showWhenStarting", False)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv/xls")
    options.set_preference("browser.download.dir", download_path)

    with open(os.path.join(download_path, 'query.txt'), "w") as f:
        f.write(query_str)

    if not debug:
        options.add_argument('--headless')

    driver = webdriver.Firefox(options=options)
    try:
        driver.get("https://www.webofscience.com/")
        driver.set_window_size(974, 1040)
        try:
            WebDriverWait(driver, 30).until(
                expected_conditions.visibility_of_element_located((By.ID, "onetrust-reject-all-handler")))
            driver.find_element(By.ID, "onetrust-reject-all-handler").click()
        except WebDriverException:
            # No (clickable) "reject all" cookie button: fall back to
            # dismissing whatever pop-ups are present.
            close_pendo_windows(driver)
        WebDriverWait(driver, 30).until(
            expected_conditions.visibility_of_element_located((By.LINK_TEXT, "Advanced Search")))
        # NOTE(review): an id containing a space looks like a class list; if
        # no element carries this id the wait returns immediately. Confirm
        # the intended locator before changing it.
        WebDriverWait(driver, 30).until(
            expected_conditions.invisibility_of_element_located((By.ID, "onetrust-pc-dark-filter ot-fade-in")))

        time.sleep(2)
        WebDriverWait(driver, 30).until(expected_conditions.element_to_be_clickable((By.LINK_TEXT, "Advanced Search")))
        driver.find_element(By.LINK_TEXT, "Advanced Search").click()

        WebDriverWait(driver, 30).until(expected_conditions.element_to_be_clickable((By.ID, "advancedSearchInputArea")))
        driver.find_element(By.ID, "advancedSearchInputArea").click()
        driver.find_element(By.ID, "advancedSearchInputArea").send_keys(query_str)
        driver.find_element(By.CSS_SELECTOR, ".mat-menu-trigger > svg").click()
        driver.find_element(By.CSS_SELECTOR, ".cdk-focused > span").click()

        # The hit counter becoming visible is used as the "results loaded" signal.
        WebDriverWait(driver, 30).until(
            expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, ".brand-blue")))
        driver.execute_script("window.scrollTo(0,0)")

        driver.find_element(By.XPATH, "//span[contains(.,\'Analyze Results\')]").click()
        driver.find_element(By.CSS_SELECTOR, "#snSelectCategories svg").click()
        driver.find_element(By.XPATH, "//span[contains(.,\'Publication Years\')]").click()
        driver.find_element(By.XPATH, "//mat-radio-button[@id=\'mat-radio-3\']/label/span/span").click()
        driver.find_element(By.XPATH, "//span[contains(.,\'Download data table\')]").click()

        # Rename only once the browser has finished writing the file.
        downloaded = os.path.join(download_path, 'analyze.txt')
        _await_analyze_download(downloaded)
        os.rename(downloaded,
                  os.path.join(download_path, f'analyze_PY_{date_time}_.txt'))

        # random.gauss may return a negative sample, which time.sleep rejects.
        time.sleep(max(0.0, random.gauss(wait_mu, wait_sigma)))
    finally:
        # quit() ends the whole browser session even when a step above failed.
        driver.quit()


def wos_fetch_yearly_output(query_str_list = (
        "TS=\"web of science\" AND PY=(2008-2010)",
        "TS=\"artificial intelligence\" AND PY=(2011-2022)"),
        wait_mu=1, wait_sigma=0.2,debug=False,
        download_root=None):
    """Download the per-publication-year result counts for each query.

    For every query a new browser session is started and a fresh, timestamped
    directory is created below *download_root*. The query is stored there as
    ``query.txt`` and the downloaded analysis table is saved as
    ``analyze_PY_<timestamp>_.txt``.

    Args:
        query_str_list: Iterable of advanced-search query strings.
        wait_mu: Mean of the random pause (seconds) after each query.
        wait_sigma: Standard deviation of that pause.
        debug: When falsy, Firefox is started with ``--headless``.
        download_root: Directory that receives the timestamped run folders.
            ``None`` keeps the previously hard-coded location.

    Raises:
        TimeoutError: if a downloaded table does not appear in time.
        selenium.common.exceptions.WebDriverException: on any unhandled
            browser-automation failure (the browser is still shut down).
    """
    if download_root is None:
        download_root = _WOS_AGGREGATED_DOWNLOAD_ROOT

    for query_str in tqdm(query_str_list):
        _fetch_yearly_output_for_query(query_str, wait_mu, wait_sigma, debug, download_root)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: run both crawlers, in order, with their default
    # example queries and with debug switched off.
    for crawl in (wos_fetch_entries, wos_fetch_yearly_output):
        crawl(debug=False)
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue