"""Selenium/Firefox helpers that export Web of Science search results to disk."""

import glob
import json
import os
import random
import time
from datetime import datetime

import pytest
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

# Download root used when the caller does not pass ``download_root``.
_DEFAULT_DOWNLOAD_ROOT = (
    r'C:\Users\radvanyi\PycharmProjects\ZSI_analytics\WOS\wos_extract\wos_downloads'
)
# Number of records requested per export.
_BATCH_SIZE = 300
# Seconds to wait for page elements and for finished downloads.
_WAIT_SECONDS = 30
# Locator of the export dialog's confirm button; it is invisible whenever the
# dialog is closed, which is used as the "dialog is gone" signal.
_EXPORT_BUTTON = (By.CSS_SELECTOR, ".mat-flat-button .ng-star-inserted")

# Pop-ups dismissed by close_pendo_windows(), tried in this order.
_POPUP_XPATHS = (
    '//*[@id="onetrust-accept-btn-handler"]',                      # cookies
    '//button[contains(@class, "_pendo-button-primaryButton")]',    # "Got it"
    '//button[contains(@class, "_pendo-button-secondaryButton")]',  # "No thanks"
    '//span[contains(@class, "_pendo-close-guide")]',               # guide close icon
    '//div[contains(@class, "cdk-overlay-container")]',             # overlay
)

# Clicks performed on the results page to download the per-year table.
_ANALYZE_STEPS = (
    (By.XPATH, "//span[contains(.,'Analyze Results')]"),
    (By.XPATH, "//button[contains(.,'Web of Science Categories')]"),
    (By.XPATH, "//span[contains(.,'Publication Years')]"),
    (By.XPATH, "//mat-radio-button[@id='mat-radio-3']/label/span/span"),
    (By.XPATH, "//span[contains(.,'Download data table')]"),
)


def close_pendo_windows(driver):
    """Close guiding windows.

    Every known pop-up is clicked once on a best-effort basis: a pop-up that
    is absent or cannot be clicked is skipped. Only WebDriver errors are
    suppressed, so unrelated failures still surface.
    """
    for xpath in _POPUP_XPATHS:
        try:
            driver.find_element(By.XPATH, xpath).click()
        except WebDriverException:
            continue


def _pause(wait_mu, wait_sigma):
    """Sleep for a Gaussian-distributed delay, clamped at zero.

    ``time.sleep`` raises ValueError for a negative argument, which
    ``random.gauss`` can produce.
    """
    time.sleep(max(0.0, random.gauss(wait_mu, wait_sigma)))


def _batch_ranges(total, batch_size=_BATCH_SIZE):
    """Return the inclusive ``(first, last)`` record ranges covering 1..total.

    Full batches are emitted while more than ``batch_size + 1`` records remain
    from the current start; the last range absorbs the rest (so it may hold up
    to ``batch_size + 1`` records). A total below 1 yields no ranges.
    """
    if total < 1:
        return []
    ranges = []
    first = 1
    while first < total - batch_size:
        ranges.append((first, first + batch_size - 1))
        first += batch_size
    ranges.append((first, total))
    return ranges


def _prepare_download_dir(download_root, subfolder, query_str):
    """Create an empty, timestamped download directory and record the query.

    Returns ``(download_path, date_time)`` where ``date_time`` is the
    timestamp string used as the directory name.
    """
    root = _DEFAULT_DOWNLOAD_ROOT if download_root is None else download_root
    date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f") + "save"
    download_path = os.path.abspath(os.path.join(root, subfolder, date_time))
    os.makedirs(download_path, exist_ok=True)
    for leftover in glob.glob(os.path.join(glob.escape(download_path), '*')):
        os.remove(leftover)
    with open(os.path.join(download_path, 'query.txt'), "w", encoding="utf-8") as f:
        f.write(query_str)
    return download_path, date_time


def _start_firefox(download_path, debug):
    """Start Firefox configured to save downloads into ``download_path``.

    The browser runs headless unless ``debug`` is truthy.
    """
    options = Options()
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.manager.showWhenStarting", False)
    # NOTE(review): "text/csv/xls" is not a valid MIME type; this preference
    # presumably expects a comma-separated list -- confirm which types the
    # site actually serves before changing it.
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv/xls")
    options.set_preference("browser.download.dir", download_path)
    if not debug:
        options.add_argument('--headless')
    return webdriver.Firefox(options=options)


def _click(driver, by, value):
    """Wait until the located element is clickable, then click it."""
    WebDriverWait(driver, _WAIT_SECONDS).until(
        expected_conditions.element_to_be_clickable((by, value))).click()


def _fill(driver, by, value, text):
    """Wait until the located input is visible, then replace its content."""
    field = WebDriverWait(driver, _WAIT_SECONDS).until(
        expected_conditions.visibility_of_element_located((by, value)))
    field.clear()
    field.send_keys(text)


def _run_advanced_search(driver, query_str, verbose=False):
    """Open the site, submit ``query_str`` via Advanced Search, return the hit count."""
    wait = WebDriverWait(driver, _WAIT_SECONDS)
    driver.get("https://www.webofscience.com/")
    driver.set_window_size(974, 1040)
    try:
        wait.until(expected_conditions.visibility_of_element_located(
            (By.ID, "onetrust-reject-all-handler"))).click()
    except WebDriverException:
        # Cookie banner missing or not clickable: try the other pop-ups.
        close_pendo_windows(driver)
    wait.until(expected_conditions.visibility_of_element_located(
        (By.LINK_TEXT, "Advanced Search")))
    # The overlay is identified by class names, not by an id; a By.ID lookup
    # with a space-separated value never matches and would not wait at all.
    wait.until(expected_conditions.invisibility_of_element_located(
        (By.CSS_SELECTOR, ".onetrust-pc-dark-filter")))
    if verbose:
        print("Hoooold...")
    time.sleep(2)
    _click(driver, By.LINK_TEXT, "Advanced Search")
    search_box = wait.until(expected_conditions.element_to_be_clickable(
        (By.ID, "advancedSearchInputArea")))
    search_box.click()
    search_box.send_keys(query_str)
    _click(driver, By.CSS_SELECTOR, ".mat-menu-trigger > svg")
    _click(driver, By.CSS_SELECTOR, ".cdk-focused > span")
    counter = wait.until(expected_conditions.visibility_of_element_located(
        (By.CSS_SELECTOR, ".brand-blue")))
    driver.execute_script("window.scrollTo(0,0)")
    # The count is displayed with thousands separators; strip both styles.
    return int(counter.text.replace(",", "").replace(".", "").strip())


def _export_batch(driver, first, last):
    """Export records ``first``..``last`` as a tab-delimited "Full Record" file."""
    wait = WebDriverWait(driver, _WAIT_SECONDS)
    # Make sure a previous export dialog has closed before opening a new one.
    wait.until(expected_conditions.invisibility_of_element_located(_EXPORT_BUTTON))
    _click(driver, By.XPATH, "//app-export-menu/div/button")
    _click(driver, By.ID, "exportToTabWinButton")
    _click(driver, By.CSS_SELECTOR, "#radio3 .mat-radio-container")
    _fill(driver, By.NAME, "markFrom", f"{first}")
    _fill(driver, By.NAME, "markTo", f"{last}")
    _click(driver, By.CSS_SELECTOR, ".margin-top-5 > .dropdown")
    _click(driver, By.XPATH, "//span[contains(.,'Full Record')]")
    _click(driver, *_EXPORT_BUTTON)
    wait.until(expected_conditions.invisibility_of_element_located(_EXPORT_BUTTON))


def _rename_when_ready(old_name, new_name, timeout=_WAIT_SECONDS, poll=0.5):
    """Rename a downloaded file, retrying until it is available.

    The rename is postponed while ``*.part`` files exist next to ``old_name``
    (presumably Firefox's in-progress downloads -- verify for the Firefox
    version in use) and retried on FileNotFoundError / PermissionError.

    Raises:
        FileNotFoundError, PermissionError: if the rename still fails once
            ``timeout`` seconds have elapsed.
    """
    deadline = time.monotonic() + timeout
    partial_pattern = os.path.join(glob.escape(os.path.dirname(old_name)), '*.part')
    while True:
        timed_out = time.monotonic() >= deadline
        if timed_out or not glob.glob(partial_pattern):
            try:
                os.rename(old_name, new_name)
                return
            except (FileNotFoundError, PermissionError):
                if timed_out:
                    raise
        time.sleep(poll)


def wos_fetch_entries(query_str="TS=\"web of science\" AND PY=(2008-2010)", wait_mu=1, wait_sigma=0.2,
                      debug=False, download_root=None):
    """Download every record matching ``query_str`` in tab-delimited batches.

    Files are written to ``<download_root>/entry_batches/<timestamp>save/`` as
    ``records_<first>_<last>.txt`` next to a ``query.txt`` holding the query.

    Args:
        query_str: Advanced Search query.
        wait_mu: Mean of the random pause (seconds) after each export.
        wait_sigma: Standard deviation of that pause.
        debug: When truthy, show the browser window instead of running headless.
        download_root: Base download directory; ``None`` selects the built-in
            default path.

    Raises:
        selenium.common.exceptions.WebDriverException: if a page element does
            not become available in time.
        FileNotFoundError, PermissionError: if a finished download cannot be
            renamed.
    """
    download_path, _ = _prepare_download_dir(download_root, 'entry_batches', query_str)
    driver = _start_firefox(download_path, debug)
    try:
        count_int = _run_advanced_search(driver, query_str, verbose=True)
        batches = _batch_ranges(count_int)
        print(f'{count_int} records found! Here we go in {len(batches)} steps...')
        saved_file = os.path.join(download_path, 'savedrecs.txt')
        for step, (first, last) in enumerate(tqdm(batches, position=0, leave=True), start=1):
            if step == len(batches):
                print(f'final batch of {first}-{last}')
            _export_batch(driver, first, last)
            _pause(wait_mu, wait_sigma)
            _rename_when_ready(
                saved_file, os.path.join(download_path, f'records_{first}_{last}.txt'))
        time.sleep(2)
        _pause(wait_mu, wait_sigma)
    finally:
        # quit() ends the whole browser session even when a step above failed.
        driver.quit()


def wos_fetch_yearly_output(query_str_list=(
        "TS=\"web of science\" AND PY=(2008-2010)",
        "TS=\"artificial intelligence\" AND PY=(2011-2022)"),
        wait_mu=1, wait_sigma=0.2, debug=False, download_root=None):
    """Download the "Publication Years" analysis table for each query.

    Each query gets its own browser session and its own directory
    ``<download_root>/aggregated/<timestamp>save/`` containing ``query.txt``
    and ``analyze_PY_<timestamp>save_.txt``. A query whose table never shows
    up on disk is reported on stdout and skipped.

    Args:
        query_str_list: Iterable of Advanced Search queries; a single string
            is treated as one query.
        wait_mu: Mean of the random pause (seconds) after each query.
        wait_sigma: Standard deviation of that pause.
        debug: When truthy, show the browser window instead of running headless.
        download_root: Base download directory; ``None`` selects the built-in
            default path.

    Raises:
        selenium.common.exceptions.WebDriverException: if a page element does
            not become available in time.
    """
    if isinstance(query_str_list, str):
        # Iterating a bare string would submit one query per character.
        query_str_list = (query_str_list,)
    for query_str in tqdm(query_str_list):
        download_path, date_time = _prepare_download_dir(
            download_root, 'aggregated', query_str)
        driver = _start_firefox(download_path, debug)
        try:
            _run_advanced_search(driver, query_str)
            for by, value in _ANALYZE_STEPS:
                _click(driver, by, value)
            try:
                _rename_when_ready(
                    os.path.join(download_path, 'analyze.txt'),
                    os.path.join(download_path, f'analyze_PY_{date_time}_.txt'))
            except FileNotFoundError:
                print(query_str, ' cannot be processed for some reason :(')
            _pause(wait_mu, wait_sigma)
        finally:
            # quit() ends the whole browser session even when a step above failed.
            driver.quit()


if __name__ == '__main__':
    wos_fetch_entries(debug=False)
    wos_fetch_yearly_output(debug=False)