{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Concatenate Web of Science record batches\n",
    "\n",
    "Walks one download batch folder under `wos_downloads/entry_batches/`, appends every tab-separated `records_*` file into a single `wos_records_concat.csv`, prints the saved search query, and finally replaces the header row with the descriptive column names listed in `col_vals`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import shutil\n",
    "import re\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Concatenate the batch files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Concatting records for query:\n",
      "\n",
      "['(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)']\n"
     ]
    }
   ],
   "source": [
    "# Batch folder to process and the file that receives the concatenated records.\n",
    "folder_token=\"2023-04-04-12-58-59-994722save\"\n",
    "workdir_path=fr\"wos_downloads/entry_batches/{folder_token}\"\n",
    "outfile='wos_records_concat.csv'\n",
    "\n",
    "# Records are appended below, so start from a clean file on every run.\n",
    "try:\n",
    "    os.remove(outfile)\n",
    "except FileNotFoundError:\n",
    "    pass\n",
    "\n",
    "with_header=True          # only the first chunk written carries the header row\n",
    "expected_columns = None   # header of the first chunk; later chunks must match it\n",
    "n_record_files = 0\n",
    "for root, dirs, files in os.walk(workdir_path):\n",
    "    # os.walk yields entries in arbitrary order; sorting in place (dirs) and\n",
    "    # per folder (files) makes the row order of the output reproducible.\n",
    "    dirs.sort()\n",
    "    for filename in sorted(files):\n",
    "        path=os.path.join(root, filename)\n",
    "        if filename.startswith(\"records_\"):\n",
    "            chunk = pd.read_csv(path, sep=\"\\t\")\n",
    "            if expected_columns is None:\n",
    "                expected_columns = list(chunk.columns)\n",
    "            elif list(chunk.columns) != expected_columns:\n",
    "                # Rows are appended without a header, so a differing layout\n",
    "                # would silently shift values into the wrong columns.\n",
    "                raise ValueError(\n",
    "                    f\"Column mismatch in {path}: expected {expected_columns}, \"\n",
    "                    f\"got {list(chunk.columns)}\"\n",
    "                )\n",
    "            chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
    "            with_header = False\n",
    "            n_record_files += 1\n",
    "        elif filename.startswith(\"query\"):\n",
    "            with open(path,\"r\") as f:\n",
    "                q=f.readlines()\n",
    "            print(\"Concatting records for query:\\n\")\n",
    "            print(q)\n",
    "\n",
    "if n_record_files == 0:\n",
    "    # The old output was removed above, so make the missing input visible here\n",
    "    # instead of only failing later when the file is read back.\n",
    "    print(f\"WARNING: no 'records_*' files found under {workdir_path!r}; {outfile!r} was not written.\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Descriptive column names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [],
   "source": [
    "# NOTE(review): presumably the snippet below is how the `col_vals` list in the\n",
    "# next cell was generated (header of an Excel export, last column dropped) - confirm.\n",
    "# df_pre = pd.read_excel(r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\v1_\\wosexport1.xls\")\n",
    "# list(df_pre.columns[:-1])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [],
   "source": [
    "# Column names assigned positionally to the concatenated file in the next cell,\n",
    "# so the order of this list must match the column order of the record files.\n",
    "col_vals = [\n",
    "    'Publication Type',\n",
    "    'Authors',\n",
    "    'Book Authors',\n",
    "    'Book Editors',\n",
    "    'Book Group Authors',\n",
    "    'Author Full Names',\n",
    "    'Book Author Full Names',\n",
    "    'Group Authors',\n",
    "    'Article Title',\n",
    "    'Source Title',\n",
    "    'Book Series Title',\n",
    "    'Book Series Subtitle',\n",
    "    'Language',\n",
    "    'Document Type',\n",
    "    'Conference Title',\n",
    "    'Conference Date',\n",
    "    'Conference Location',\n",
    "    'Conference Sponsor',\n",
    "    'Conference Host',\n",
    "    'Author Keywords',\n",
    "    'Keywords Plus',\n",
    "    'Abstract',\n",
    "    'Addresses',\n",
    "    'Affiliations',\n",
    "    'Reprint Addresses',\n",
    "    'Email Addresses',\n",
    "    'Researcher Ids',\n",
    "    'ORCIDs',\n",
    "    'Funding Orgs',\n",
    "    'Funding Name Preferred',\n",
    "    'Funding Text',\n",
    "    'Cited References',\n",
    "    'Cited Reference Count',\n",
    "    'Times Cited, WoS Core',\n",
    "    'Times Cited, All Databases',\n",
    "    '180 Day Usage Count',\n",
    "    'Since 2013 Usage Count',\n",
    "    'Publisher',\n",
    "    'Publisher City',\n",
    "    'Publisher Address',\n",
    "    'ISSN',\n",
    "    'eISSN',\n",
    "    'ISBN',\n",
    "    'Journal Abbreviation',\n",
    "    'Journal ISO Abbreviation',\n",
    "    'Publication Date',\n",
    "    'Publication Year',\n",
    "    'Volume',\n",
    "    'Issue',\n",
    "    'Part Number',\n",
    "    'Supplement',\n",
    "    'Special Issue',\n",
    "    'Meeting Abstract',\n",
    "    'Start Page',\n",
    "    'End Page',\n",
    "    'Article Number',\n",
    "    'DOI',\n",
    "    'DOI Link',\n",
    "    'Book DOI',\n",
    "    'Early Access Date',\n",
    "    'Number of Pages',\n",
    "    'WoS Categories',\n",
    "    'Web of Science Index',\n",
    "    'Research Areas',\n",
    "    'IDS Number',\n",
    "    'Pubmed Id',\n",
    "    'Open Access Designations',\n",
    "    'Highly Cited Status',\n",
    "    'Hot Paper Status',\n",
    "    'Date of Export',\n",
    "    'UT (Unique WOS ID)',\n",
    "]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Apply the column names and save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [],
   "source": [
    "# Read the concatenated file back, swap its header for `col_vals`, and overwrite it.\n",
    "df = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
    "\n",
    "# The rename is positional; fail with a message that names the file and both\n",
    "# counts rather than relying on pandas' generic length-mismatch error.\n",
    "if len(df.columns) != len(col_vals):\n",
    "    raise ValueError(\n",
    "        f\"{outfile} has {len(df.columns)} columns but col_vals lists {len(col_vals)} names\"\n",
    "    )\n",
    "df.columns = col_vals\n",
    "df.to_csv(outfile, index=False, header=True, sep=\"\\t\")"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}