blabla/WOS/wos_extract/wos_records_concatter.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import shutil\n",
    "import re\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Concatting records for query:\n",
      "\n",
      "['CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUSTRIA OR BELGIUM OR BULGARIA OR CROATIA OR CYPRUS OR CZECH REPUBLIC OR DENMARK OR ESTONIA OR FINLAND OR FRANCE OR GERMANY OR GREECE OR HUNGARY OR IRELAND OR ITALY OR LATVIA OR LITHUANIA OR LUXEMBOURG OR MALTA OR NETHERLANDS OR POLAND OR PORTUGAL OR ROMANIA OR SLOVAKIA OR SLOVENIA OR SPAIN OR SWEDEN OR NORWAY OR SWITZERLAND OR UNITED KINGDOM OR ENGLAND OR WALES OR SCOTLAND OR N IRELAND) AND TS=(\"artificial intelligence*\" OR \"machine* learn*\" OR \"neural network*\" OR \"big data*\" OR \"deep learn*\" OR \"pattern recognition\" OR \"computer vision\" OR \"image classification\" OR \"reinforcement learning\" OR \"support vector machine*\" OR \"recommender system*\" OR \"random forest\" OR \"ensemble model*\" OR \"image processing\" OR \"generative network*\" OR \"ai ethic*\" OR \"natural language processing\" OR \"clustering algorithm*\" OR \"feature extraction\" OR \"time series forecast*\" OR \"anomaly detection\" OR \"identity fraud detection\" OR \"dimensionality reduction\" OR \"feature elicitation\" OR \"chatbot*\" OR \"clustering\" OR \"*supervised learning\" OR \"convolutional network*\" OR \"convolutional neural\" OR \"adversarial network*\" OR \"adversarial neural\" OR \"adversarial machine\" OR \"autoencoder*\" OR \"gated recurrent unit*\" OR \"perceptron*\" OR \"feature learning\" OR \"feature engineering\" OR \"long short-term memor*\" OR \"word embedding*\" OR \"word vector*\" OR \"gradient descent\" OR \"k-nearest neighbor*\" OR \"naive bayes\" OR \"transfer learning\" OR \"fuzzy logic\" OR \"backpropagation\" OR \"computational modeling\" OR \"computational statistic*\" OR \"intelligent agent*\" OR \"expert system*\" OR \"decision tree*\" OR \"Bayesian network*\" OR \"genetic algorithm*\" OR \"swarm intelligence\" OR \"cognitive computing\" OR \"artificial neural network*\" OR \"convolutional neural network*\" OR \"recurrent neural network*\" OR \"ensemble learning\" OR \"data mining\" OR \"artificial general intelligence\" OR \"artificial consciousness\" OR \"evolutionary algorithm*\" OR \"self-organizing map*\" OR \"deep reinforcement learning\" OR \"adversarial machine learning\" OR \"machine vision\" OR \"neural-symbolic integration\" OR \"probabilistic graphical model*\" OR \"hybrid intelligent system*\" OR \"machine creativity\" OR \"explainable AI\" OR \"interactive machine learning\" OR \"artificial emotional intelligence\" OR \"evolutionary computation*\" OR \"human-in-the-loop\" OR \"unsupervised deep learning\" OR \"deep belief network*\" OR \"quantum machine learning\" OR \"artificial immune system*\" OR \"swarm robotics\" OR \"autonomous agents\" OR \"machine ethics\" OR \"collaborative filtering\" OR \"content based filtering\" OR \"pervasive computing\" OR \"ubiquitous computing\" OR \"human-computer interaction\" OR \"cloud computing\" OR \"Internet of Things\" OR \"artificial cognition\" OR \"computational creativity\" OR \"sentiment analy*\" OR \"robotics\" OR \"boltzmann machine*\" OR \"kernel machine*\" OR \"Hopfield network*\" OR \"Hebbian learning\" OR \"latent factor model*\" OR \"non-negative matrix factorization\" OR \"independent component analysis\" OR \"principal component analysis\" OR \"data augmentation\" OR \"image segmentation\" OR \"autoregressive language model*\" OR \"generative pre-trained transformer*\" OR \"smart city\" OR \"smart home\" OR \"smart grid\" OR \"smart health\" OR \"smart manufacturing\" OR \"smart agriculture\" OR \"smart environment\" OR \"smart energy\" OR \"smart mobility\" OR \"smart buildings\" OR \"smart tourism\" OR \"smart logistics\" OR \"smart supply chain\" OR \"smart retail\" OR \"smart waste management\" OR \"smart parking\" OR \"smart governance\" OR \"smart education\" OR \"smart technolog*\" OR \"smart diagnostic*\" OR \"data* analytic*\" OR \"hadoop*\" OR \"mapreduce\" OR \"map$reduce\" OR \"large$ dataset*\" OR \"data warehouse*\" OR \"predictive analytic*\" OR \"no$sql\" OR \"nosql\" OR \"no sql\" OR \"unstructured data*\" OR \"data science*\") AND PY=(2011-2022)']\n"
     ]
    }
   ],
   "source": [
    "folder_token=\"2023-04-06-11-22-23-982324save\"\n",
    "workdir_path=fr\"wos_downloads/entry_batches/{folder_token}\"\n",
    "outfile='wos_records_concat.csv'\n",
    "try:\n",
    "    os.remove(outfile)\n",
    "except FileNotFoundError:\n",
    "    pass\n",
    "with_header=True\n",
    "for root, dirs, files in os.walk(workdir_path):\n",
    "    for filename in files:\n",
    "        path=os.path.join(root, filename)\n",
    "        if filename.startswith(\"records_\"):\n",
    "            chunk = pd.read_csv(path, sep=\"\\t\")\n",
    "            chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
    "            with_header = False\n",
    "        elif filename.startswith(\"query\"):\n",
    "            with open(path,\"r\") as f:\n",
    "                q=f.readlines()\n",
    "            print(\"Concatting records for query:\\n\")\n",
    "            print(q)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "# df_pre = pd.read_excel(r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\v1_\\wosexport1.xls\")\n",
    "# list(df_pre.columns[:-1])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "col_vals = ['Publication Type',\n",
    " 'Authors',\n",
    " 'Book Authors',\n",
    " 'Book Editors',\n",
    " 'Book Group Authors',\n",
    " 'Author Full Names',\n",
    " 'Book Author Full Names',\n",
    " 'Group Authors',\n",
    " 'Article Title',\n",
    " 'Source Title',\n",
    " 'Book Series Title',\n",
    " 'Book Series Subtitle',\n",
    " 'Language',\n",
    " 'Document Type',\n",
    " 'Conference Title',\n",
    " 'Conference Date',\n",
    " 'Conference Location',\n",
    " 'Conference Sponsor',\n",
    " 'Conference Host',\n",
    " 'Author Keywords',\n",
    " 'Keywords Plus',\n",
    " 'Abstract',\n",
    " 'Addresses',\n",
    " 'Affiliations',\n",
    " 'Reprint Addresses',\n",
    " 'Email Addresses',\n",
    " 'Researcher Ids',\n",
    " 'ORCIDs',\n",
    " 'Funding Orgs',\n",
    " 'Funding Name Preferred',\n",
    " 'Funding Text',\n",
    " 'Cited References',\n",
    " 'Cited Reference Count',\n",
    " 'Times Cited, WoS Core',\n",
    " 'Times Cited, All Databases',\n",
    " '180 Day Usage Count',\n",
    " 'Since 2013 Usage Count',\n",
    " 'Publisher',\n",
    " 'Publisher City',\n",
    " 'Publisher Address',\n",
    " 'ISSN',\n",
    " 'eISSN',\n",
    " 'ISBN',\n",
    " 'Journal Abbreviation',\n",
    " 'Journal ISO Abbreviation',\n",
    " 'Publication Date',\n",
    " 'Publication Year',\n",
    " 'Volume',\n",
    " 'Issue',\n",
    " 'Part Number',\n",
    " 'Supplement',\n",
    " 'Special Issue',\n",
    " 'Meeting Abstract',\n",
    " 'Start Page',\n",
    " 'End Page',\n",
    " 'Article Number',\n",
    " 'DOI',\n",
    " 'DOI Link',\n",
    " 'Book DOI',\n",
    " 'Early Access Date',\n",
    " 'Number of Pages',\n",
    " 'WoS Categories',\n",
    " 'Web of Science Index',\n",
    " 'Research Areas',\n",
    " 'IDS Number',\n",
    " 'Pubmed Id',\n",
    " 'Open Access Designations',\n",
    " 'Highly Cited Status',\n",
    " 'Hot Paper Status',\n",
    " 'Date of Export',\n",
    " 'UT (Unique WOS ID)']"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "df = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
    "df.columns = col_vals\n",
    "# df\n",
    "df.to_csv(outfile, index=False, header=True, sep=\"\\t\")"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}