You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_extract/wos_records_concatter.ipynb

189 lines
6.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"import re\n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Concatting records for query:\n",
"\n",
"['(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)']\n"
]
}
],
"source": [
"# Merge every tab-separated \"records_*\" export found under one download batch into a\n",
"# single file, then echo the query text stored alongside the batch.\n",
"folder_token = \"2023-04-04-12-58-59-994722save\"\n",
"workdir_path = fr\"wos_downloads/entry_batches/{folder_token}\"\n",
"outfile = 'wos_records_concat.csv'\n",
"\n",
"# Chunks are appended below, so a file left over from an earlier run has to go first;\n",
"# otherwise its rows would be duplicated in the new output.\n",
"try:\n",
"    os.remove(outfile)\n",
"except FileNotFoundError:\n",
"    pass\n",
"\n",
"with_header = True       # only the first chunk written carries the header row\n",
"expected_columns = None  # column names of the first chunk; later chunks must match\n",
"q = None                 # lines of the last \"query*\" file encountered, if any\n",
"\n",
"for root, dirs, files in os.walk(workdir_path):\n",
"    # os.walk yields entries in arbitrary order. Sorting the directory list in place\n",
"    # (which steers the traversal) and the file names makes the row order of the\n",
"    # output reproducible. The order is lexicographic, not numeric.\n",
"    dirs.sort()\n",
"    for filename in sorted(files):\n",
"        path = os.path.join(root, filename)\n",
"        if filename.startswith(\"records_\"):\n",
"            chunk = pd.read_csv(path, sep=\"\\t\")\n",
"            if expected_columns is None:\n",
"                expected_columns = list(chunk.columns)\n",
"            elif list(chunk.columns) != expected_columns:\n",
"                # Rows are appended without a header, i.e. purely by position; a chunk\n",
"                # with a different layout would silently put values in wrong columns.\n",
"                raise ValueError(\n",
"                    f\"Column layout of {path!r} differs from the first records file\"\n",
"                )\n",
"            chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
"            with_header = False\n",
"        elif filename.startswith(\"query\"):\n",
"            with open(path, \"r\") as f:\n",
"                q = f.readlines()\n",
"\n",
"if with_header:\n",
"    # No chunk was written (missing directory or no matching files). Fail here with a\n",
"    # clear message instead of letting a later cell fail on the absent output file.\n",
"    raise FileNotFoundError(f\"No 'records_*' files found under {workdir_path!r}\")\n",
"\n",
"print(\"Concatting records for query:\\n\")\n",
"if q is None:\n",
"    # Previously this case raised NameError because q was never assigned.\n",
"    print(\"(no query file found)\")\n",
"else:\n",
"    print(q)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"# df_pre = pd.read_excel(r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\v1_\\wosexport1.xls\")\n",
"# list(df_pre.columns[:-1])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"col_vals = ['Publication Type',\n",
" 'Authors',\n",
" 'Book Authors',\n",
" 'Book Editors',\n",
" 'Book Group Authors',\n",
" 'Author Full Names',\n",
" 'Book Author Full Names',\n",
" 'Group Authors',\n",
" 'Article Title',\n",
" 'Source Title',\n",
" 'Book Series Title',\n",
" 'Book Series Subtitle',\n",
" 'Language',\n",
" 'Document Type',\n",
" 'Conference Title',\n",
" 'Conference Date',\n",
" 'Conference Location',\n",
" 'Conference Sponsor',\n",
" 'Conference Host',\n",
" 'Author Keywords',\n",
" 'Keywords Plus',\n",
" 'Abstract',\n",
" 'Addresses',\n",
" 'Affiliations',\n",
" 'Reprint Addresses',\n",
" 'Email Addresses',\n",
" 'Researcher Ids',\n",
" 'ORCIDs',\n",
" 'Funding Orgs',\n",
" 'Funding Name Preferred',\n",
" 'Funding Text',\n",
" 'Cited References',\n",
" 'Cited Reference Count',\n",
" 'Times Cited, WoS Core',\n",
" 'Times Cited, All Databases',\n",
" '180 Day Usage Count',\n",
" 'Since 2013 Usage Count',\n",
" 'Publisher',\n",
" 'Publisher City',\n",
" 'Publisher Address',\n",
" 'ISSN',\n",
" 'eISSN',\n",
" 'ISBN',\n",
" 'Journal Abbreviation',\n",
" 'Journal ISO Abbreviation',\n",
" 'Publication Date',\n",
" 'Publication Year',\n",
" 'Volume',\n",
" 'Issue',\n",
" 'Part Number',\n",
" 'Supplement',\n",
" 'Special Issue',\n",
" 'Meeting Abstract',\n",
" 'Start Page',\n",
" 'End Page',\n",
" 'Article Number',\n",
" 'DOI',\n",
" 'DOI Link',\n",
" 'Book DOI',\n",
" 'Early Access Date',\n",
" 'Number of Pages',\n",
" 'WoS Categories',\n",
" 'Web of Science Index',\n",
" 'Research Areas',\n",
" 'IDS Number',\n",
" 'Pubmed Id',\n",
" 'Open Access Designations',\n",
" 'Highly Cited Status',\n",
" 'Hot Paper Status',\n",
" 'Date of Export',\n",
" 'UT (Unique WOS ID)']"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [],
"source": [
"# Relabel the concatenated export: load the whole tab-separated file, replace its\n",
"# header with the names in `col_vals`, and write it back over the same path.\n",
"df = pd.read_csv(\n",
"    outfile,\n",
"    sep=\"\\t\",\n",
"    low_memory=False,\n",
")\n",
"\n",
"# Assignment is positional; pandas raises if the number of names and columns differ.\n",
"df.columns = col_vals\n",
"\n",
"df.to_csv(\n",
"    outfile,\n",
"    sep=\"\\t\",\n",
"    header=True,\n",
"    index=False,\n",
")"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}