{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import shutil\n",
    "import re\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Concatting records for query:\n",
      "\n",
      "['(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)']\n"
     ]
    }
   ],
   "source": [
    "folder_token=\"2023-04-04-12-58-59-994722save\"\n",
    "workdir_path=fr\"wos_downloads/entry_batches/{folder_token}\"\n",
    "outfile='wos_records_concat.csv'\n",
    "try:\n",
    "    os.remove(outfile)\n",
    "except FileNotFoundError:\n",
    "    pass\n",
    "with_header=True\n",
    "for root, dirs, files in os.walk(workdir_path):\n",
    "    for filename in files:\n",
    "        path=os.path.join(root, filename)\n",
    "        if filename.startswith(\"records_\"):\n",
    "            chunk = pd.read_csv(path, sep=\"\\t\")\n",
    "            chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
    "            with_header = False\n",
    "        elif filename.startswith(\"query\"):\n",
    "            with open(path,\"r\") as f:\n",
    "                q=f.readlines()\n",
    "            print(\"Concatting records for query:\\n\")\n",
    "            print(q)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [],
   "source": [
    "# df_pre = pd.read_excel(r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\v1_\\wosexport1.xls\")\n",
    "# list(df_pre.columns[:-1])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [],
   "source": [
    "col_vals = ['Publication Type',\n",
    " 'Authors',\n",
    " 'Book Authors',\n",
    " 'Book Editors',\n",
    " 'Book Group Authors',\n",
    " 'Author Full Names',\n",
    " 'Book Author Full Names',\n",
    " 'Group Authors',\n",
    " 'Article Title',\n",
    " 'Source Title',\n",
    " 'Book Series Title',\n",
    " 'Book Series Subtitle',\n",
    " 'Language',\n",
    " 'Document Type',\n",
    " 'Conference Title',\n",
    " 'Conference Date',\n",
    " 'Conference Location',\n",
    " 'Conference Sponsor',\n",
    " 'Conference Host',\n",
    " 'Author Keywords',\n",
    " 'Keywords Plus',\n",
    " 'Abstract',\n",
    " 'Addresses',\n",
    " 'Affiliations',\n",
    " 'Reprint Addresses',\n",
    " 'Email Addresses',\n",
    " 'Researcher Ids',\n",
    " 'ORCIDs',\n",
    " 'Funding Orgs',\n",
    " 'Funding Name Preferred',\n",
    " 'Funding Text',\n",
    " 'Cited References',\n",
    " 'Cited Reference Count',\n",
    " 'Times Cited, WoS Core',\n",
    " 'Times Cited, All Databases',\n",
    " '180 Day Usage Count',\n",
    " 'Since 2013 Usage Count',\n",
    " 'Publisher',\n",
    " 'Publisher City',\n",
    " 'Publisher Address',\n",
    " 'ISSN',\n",
    " 'eISSN',\n",
    " 'ISBN',\n",
    " 'Journal Abbreviation',\n",
    " 'Journal ISO Abbreviation',\n",
    " 'Publication Date',\n",
    " 'Publication Year',\n",
    " 'Volume',\n",
    " 'Issue',\n",
    " 'Part Number',\n",
    " 'Supplement',\n",
    " 'Special Issue',\n",
    " 'Meeting Abstract',\n",
    " 'Start Page',\n",
    " 'End Page',\n",
    " 'Article Number',\n",
    " 'DOI',\n",
    " 'DOI Link',\n",
    " 'Book DOI',\n",
    " 'Early Access Date',\n",
    " 'Number of Pages',\n",
    " 'WoS Categories',\n",
    " 'Web of Science Index',\n",
    " 'Research Areas',\n",
    " 'IDS Number',\n",
    " 'Pubmed Id',\n",
    " 'Open Access Designations',\n",
    " 'Highly Cited Status',\n",
    " 'Hot Paper Status',\n",
    " 'Date of Export',\n",
    " 'UT (Unique WOS ID)']"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [],
   "source": [
    "df = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
    "df.columns = col_vals\n",
    "# df\n",
    "df.to_csv(outfile, index=False, header=True, sep=\"\\t\")"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}