{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import shutil\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Concatting records for query:\n",
      "\n",
      "['CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUSTRIA OR BELGIUM OR BULGARIA OR CROATIA OR CYPRUS OR CZECH REPUBLIC OR DENMARK OR ESTONIA OR FINLAND OR FRANCE OR GERMANY OR GREECE OR HUNGARY OR IRELAND OR ITALY OR LATVIA OR LITHUANIA OR LUXEMBOURG OR MALTA OR NETHERLANDS OR POLAND OR PORTUGAL OR ROMANIA OR SLOVAKIA OR SLOVENIA OR SPAIN OR SWEDEN OR NORWAY OR SWITZERLAND OR UNITED KINGDOM OR ENGLAND OR WALES OR SCOTLAND OR N IRELAND) AND TS=(\"neural network*\" OR \"machine* learn*\" OR \"deep learn*\" OR \"clustering\" OR \"remote sensing\" OR \"convolutional neural\" OR \"Internet of Things\" OR \"feature extraction\" OR \"genetic algorithm*\" OR \"big data*\" OR \"artificial intelligence*\" OR \"data driven*\" OR \"support vector machine*\" OR \"classifier\" OR (\"logistic regression\" NOT (\"p\"  NEAR/0 \"0.0*\" OR \"p$value*\" OR \"p-value*\")) OR \"optimization algorithm*\" OR \"principal component analysis\" OR \"artificial neural network*\" OR \"swarm optimization\" OR \"regularization\" OR (\"linear regression\" NOT (\"p\"  NEAR/0 \"0.0*\" OR \"p$value*\" OR \"p-value*\")) OR \"optimization algorithm\" OR \"random forest\" OR \"cloud computing\" OR \"reinforcement learning\" OR \"computer vision\" OR \"kalman filter*\" OR \"image processing\" OR \"data mining\" OR \"evolutionary algorithm*\" OR \"edge computing\" OR \"*supervised learning\" OR \"computational modeling\" OR \"pattern recognition\" OR \"image classification\" OR \"long short-term memor*\" OR \"robotics\" OR \"image segmentation\" OR \"convex optimization\" OR \"covariance matri*\" OR \"attention mechanism*\" OR \"markov chain\" OR (\"object detection\" NOT \"brain\") OR \"clustering algorithm*\" OR \"recurrent neural network*\" OR \"data augmentation\" OR \"transfer learning\" OR \"loss function*\" OR \"adversarial network*\" OR \"decision tree*\" OR \"multi agent system*\" OR \"fuzzy set*\" OR \"convolutional network*\" OR \"image reconstruction\" OR \"data* analytic*\" OR \"smart 
grid\" OR \"autoencoder*\" OR \"fuzzy logic\" OR \"radial basis function\" OR \"Bayesian network*\" OR \"dimensionality reduction\" OR (\"face recognition\" NOT \"brain\") OR \"gaussian process\" OR \"anomaly detection\" OR \"k-nearest neighbor*\" OR \"natural language processing\" OR \"monte carlo method\" OR \"large$ dataset*\" OR \"gradient descent\" OR \"support vector regression\" OR \"extreme learning machine*\" OR \"perceptron*\" OR \"model selection\" OR \"ensemble learning\" OR \"representation learning\" OR \"recommender system*\" OR \"target tracking\" OR \"singular value decomposition\" OR \"KNN\" OR \"feature learning\" OR \"smart city\" OR \"sentiment analy*\" OR \"markov decision process\" OR \"k-means clustering\" OR \"independent component analysis\" OR \"brain computer interface\" OR \"human-computer interaction\" OR \"markov chain monte carlo\" OR \"hierarchical clustering\" OR \"semantic web*\" OR \"semi-supervised learning\" OR \"human-robot interact*\" OR \"knowledge graph*\" OR (\"speech recognition\" NOT \"brain\") OR \"ensemble model*\" OR \"fog computing\" OR \"map$reduce\" OR \"evolutionary computation*\" OR \"data science*\" OR \"text mining\" OR \"generative model*\" OR \"active learning\" OR \"swarm intelligence\" OR \"multi-task learning\" OR \"language model*\" OR \"collaborative filtering\" OR \"backpropagation\" OR \"machine vision\" OR \"computer-aided diagnosis\" OR \"gated recurrent unit*\" OR \"lagrange multiplier\" OR \"expert system*\" OR \"learning rate*\" OR \"hadoop*\" OR \"markov process\" OR \"nonlinear optimization\" OR \"learning system\" OR \"self-organizing map*\" OR \"smart manufacturing\" OR \"smart home\" OR \"few shot learning\" OR \"few-shot learning\" OR \"meta-learning\" OR \"meta learning\" OR \"adversarial training\" OR \"zero-shot learning\" OR \"word embedding*\" OR \"expectation maximization algorithm*\" OR \"stochastic gradient descent\" OR \"ridge regression\" OR \"deep belief network*\" OR 
\"non-negative matrix factorization\" OR \"affective computing\" OR \"latent dirichlet allocation\" OR \"kernel method\" OR \"kernel learning\" OR \"feature engineering\" OR \"variational inference\" OR \"image representation\" OR \"manifold learning\" OR \"t5\" OR \"adversarial example*\" OR \"knowledge distillation\" OR \"time series forecast*\" OR \"variational autoencoder*\" OR \"lasso regression\" OR \"smart energy\" OR \"dbscan\" OR \"multi-label classification\" OR \"intelligent robot*\" OR \"ubiquitous computing\" OR \"gaussian mixture models\" OR \"smart technolog*\" OR \"boltzmann machine*\" OR \"smart buildings\" OR \"predictive analytic*\" OR \"pervasive computing\" OR \"smart agriculture\" OR \"capsule network*\" OR \"human-in-the-loop\" OR \"intelligent agent*\" OR \"ai applications\" OR \"word vector*\" OR \"transformer model*\" OR \"facial recognition\" OR \"unstructured data*\" OR \"restricted boltzmann machine*\" OR \"albert\" OR \"lifelong learning\" OR \"autonomous agents\" OR \"chatbot*\" OR \"Cholesky decomposition\" OR \"no$sql\" OR \"nosql\" OR \"explainable AI\" OR \"seq2seq\" OR \"probabilistic graphical model*\" OR \"QR decomposition\" OR \"L? 
regulari*\" OR \"unsupervised deep learning\" OR \"data warehouse*\" OR \"quantum machine learning\" OR \"continual learning\" OR \"smart environment\" OR \"multimodal learning\" OR \"smart health\" OR \"artificial immune system*\" OR \"swarm robotics\" OR \"kernel machine*\" OR \"latent factor model*\" OR \"eigendecomposition\" OR \"adversarial machine\" OR \"adversarial machine learning\" OR \"smart mobility\" OR \"sequence-to-sequence model*\" OR \"eigen decomposition\" OR \"adversarial robustness\" OR \"smart parking\" OR \"adversarial neural\" OR \"roberta\" OR \"bidirectional encoder representations from transformer*\" OR \"locally linear embedding*\" OR \"Hebbian learning\" OR \"one-shot learning\" OR \"multimodal representation\" OR \"smart tourism\" OR \"entity extraction\" OR \"adaptive moment estimation\" OR \"ontology learning\" OR \"topic modeling*\") AND PY=(2011-2022)']\n"
     ]
    }
   ],
   "source": [
    "# Merge every per-batch \"records_*\" export found under one download folder into a\n",
    "# single tab-separated file, and echo the query file stored next to the batches.\n",
    "folder_token=\"2023-04-28-17-20-37-539689save\"\n",
    "workdir_path=fr\"wos_downloads/entry_batches/{folder_token}\"\n",
    "outfile='wos_records_concat.csv'\n",
    "\n",
    "\n",
    "def _natural_key(name):\n",
    "    \"\"\"Sort key ordering embedded digit runs numerically (records_2 < records_10).\"\"\"\n",
    "    # re.split with a capturing group alternates text / digit parts, so odd indices are digits\n",
    "    return [int(part) if idx % 2 else part\n",
    "            for idx, part in enumerate(re.split(r\"(\\d+)\", name))]\n",
    "\n",
    "\n",
    "# Start from a clean slate: chunks are appended below, so a stale file would be extended.\n",
    "try:\n",
    "    os.remove(outfile)\n",
    "except FileNotFoundError:\n",
    "    pass\n",
    "\n",
    "with_header=True  # only the first chunk written contributes the header row\n",
    "n_chunks = 0\n",
    "for root, dirs, files in os.walk(workdir_path):\n",
    "    # os.walk returns entries in arbitrary order; sorting dirs in place and iterating\n",
    "    # sorted files makes the row order of the output reproducible between runs.\n",
    "    dirs.sort(key=_natural_key)\n",
    "    for filename in sorted(files, key=_natural_key):\n",
    "        path=os.path.join(root, filename)\n",
    "        if filename.startswith(\"records_\"):\n",
    "            # dtype=str + keep_default_na=False copy every field verbatim; with type\n",
    "            # inference, integer columns containing blanks are rewritten as floats\n",
    "            # (\"2011\" -> \"2011.0\") and strings such as \"NA\" are blanked out.\n",
    "            chunk = pd.read_csv(path, sep=\"\\t\", dtype=str, keep_default_na=False)\n",
    "            chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
    "            with_header = False\n",
    "            n_chunks += 1\n",
    "        elif filename.startswith(\"query\"):\n",
    "            with open(path,\"r\") as f:\n",
    "                q=f.readlines()\n",
    "            print(\"Concatting records for query:\\n\")\n",
    "            print(q)\n",
    "\n",
    "if n_chunks == 0:\n",
    "    # Without this hint the next read of `outfile` fails with an unrelated-looking error.\n",
    "    print(f\"WARNING: no 'records_*' files found under {workdir_path!r}; {outfile!r} was not written\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
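    "# NOTE(review): presumably how the `col_vals` name list was obtained -- the header of a\n",
    "# one-off Web of Science Excel export, minus its last column. Kept for provenance only;\n",
    "# the absolute path is machine-specific and will not resolve elsewhere.\n",
    "# df_pre = pd.read_excel(r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\v1_\\wosexport1.xls\")\n",
    "# list(df_pre.columns[:-1])"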
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "# Descriptive column names for the concatenated export, listed in column order.\n",
    "# The list is applied positionally (df.columns = col_vals), so order and length matter.\n",
    "col_vals = [\n",
    "    'Publication Type',\n",
    "    'Authors',\n",
    "    'Book Authors',\n",
    "    'Book Editors',\n",
    "    'Book Group Authors',\n",
    "    'Author Full Names',\n",
    "    'Book Author Full Names',\n",
    "    'Group Authors',\n",
    "    'Article Title',\n",
    "    'Source Title',\n",
    "    'Book Series Title',\n",
    "    'Book Series Subtitle',\n",
    "    'Language',\n",
    "    'Document Type',\n",
    "    'Conference Title',\n",
    "    'Conference Date',\n",
    "    'Conference Location',\n",
    "    'Conference Sponsor',\n",
    "    'Conference Host',\n",
    "    'Author Keywords',\n",
    "    'Keywords Plus',\n",
    "    'Abstract',\n",
    "    'Addresses',\n",
    "    'Affiliations',\n",
    "    'Reprint Addresses',\n",
    "    'Email Addresses',\n",
    "    'Researcher Ids',\n",
    "    'ORCIDs',\n",
    "    'Funding Orgs',\n",
    "    'Funding Name Preferred',\n",
    "    'Funding Text',\n",
    "    'Cited References',\n",
    "    'Cited Reference Count',\n",
    "    'Times Cited, WoS Core',\n",
    "    'Times Cited, All Databases',\n",
    "    '180 Day Usage Count',\n",
    "    'Since 2013 Usage Count',\n",
    "    'Publisher',\n",
    "    'Publisher City',\n",
    "    'Publisher Address',\n",
    "    'ISSN',\n",
    "    'eISSN',\n",
    "    'ISBN',\n",
    "    'Journal Abbreviation',\n",
    "    'Journal ISO Abbreviation',\n",
    "    'Publication Date',\n",
    "    'Publication Year',\n",
    "    'Volume',\n",
    "    'Issue',\n",
    "    'Part Number',\n",
    "    'Supplement',\n",
    "    'Special Issue',\n",
    "    'Meeting Abstract',\n",
    "    'Start Page',\n",
    "    'End Page',\n",
    "    'Article Number',\n",
    "    'DOI',\n",
    "    'DOI Link',\n",
    "    'Book DOI',\n",
    "    'Early Access Date',\n",
    "    'Number of Pages',\n",
    "    'WoS Categories',\n",
    "    'Web of Science Index',\n",
    "    'Research Areas',\n",
    "    'IDS Number',\n",
    "    'Pubmed Id',\n",
    "    'Open Access Designations',\n",
    "    'Highly Cited Status',\n",
    "    'Hot Paper Status',\n",
    "    'Date of Export',\n",
    "    'UT (Unique WOS ID)',\n",
    "]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "# Swap the header of the concatenated file for the descriptive names in `col_vals`\n",
    "# (positional mapping) and write the result back to the same path.\n",
    "# dtype=str + keep_default_na=False keep every field verbatim; with type inference,\n",
    "# integer columns containing blanks would be written back as floats (\"2011\" -> \"2011.0\").\n",
    "df = pd.read_csv(outfile, sep=\"\\t\", dtype=str, keep_default_na=False)\n",
    "if len(df.columns) != len(col_vals):\n",
    "    # Same exception type pandas raises on the assignment below, but with an actionable message.\n",
    "    raise ValueError(f\"{outfile!r} has {len(df.columns)} columns but col_vals lists {len(col_vals)} names\")\n",
    "df.columns = col_vals\n",
    "df.to_csv(outfile, index=False, header=True, sep=\"\\t\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}