query composer simplified syntax

utku_keyword_suggestion
radvanyimome 1 year ago
parent b5cdfe2b06
commit 19ff407a5a

File diff suppressed because one or more lines are too long

@ -152,4 +152,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
}
}

@ -1 +1,138 @@
artificial intelligence,machine learning,neural network,big data,deep learning,pattern recognition,computer vision, image classification, reinforcement learning, support vector machines, recommender system, random forest, ensemble model, image processing, generative network, ai ethic, natural language processing, clustering algorithm, feature extraction, time series forecast, anomaly detection, identity fraud detection, dimensionality reduction, feature elicitation, chatbot, clustering, unsupervised learning, supervised learning, convolutional network, adversarial network
artificial intelligence*,
machine* learn*,
neural network*,
big data*,
deep learn*,
pattern recognition,
computer vision,
image classification,
reinforcement learning,
support vector machine*,
recommender system*,
random forest,
ensemble model*,
image processing,
generative network*,
ai ethic*,
natural language processing,
clustering algorithm*,
feature extraction,
time series forecast*,
anomaly detection,
identity fraud detection,
dimensionality reduction,
feature elicitation,
chatbot*,
clustering,
*supervised learning,
convolutional network*,
convolutional neural,
adversarial network*,
adversarial neural,
adversarial machine,
autoencoder*,
gated recurrent unit*,
perceptron*,
feature learning,
feature engineering,
long short-term memor*,
word embedding*,
word vector*,
gradient descent,
k-nearest neighbor*,
naive bayes,
transfer learning,
fuzzy logic,
backpropagation,
computational modeling,
computational statistic*,
intelligent agent*,
expert system*,
decision tree*,
Bayesian network*,
genetic algorithm*,
swarm intelligence,
cognitive computing,
artificial neural network*,
convolutional neural network*,
recurrent neural network*,
ensemble learning,
data mining,
artificial general intelligence,
artificial consciousness,
evolutionary algorithm*,
self-organizing map*,
deep reinforcement learning,
adversarial machine learning,
machine vision,
neural-symbolic integration,
probabilistic graphical model*,
hybrid intelligent system*,
machine creativity,
explainable AI,
interactive machine learning,
artificial emotional intelligence,
evolutionary computation*,
human-in-the-loop,
unsupervised deep learning,
deep belief network*,
quantum machine learning,
artificial immune system*,
swarm robotics,
autonomous agents,
machine ethics,
collaborative filtering,
content based filtering,
pervasive computing,
ubiquitous computing,
human-computer interaction,
cloud computing,
Internet of Things,
artificial cognition,
computational creativity,
sentiment analy*,
robotics,
boltzmann machine*,
kernel machine*,
Hopfield network*,
Hebbian learning,
latent factor model*,
non-negative matrix factorization,
independent component analysis,
principal component analysis,
data augmentation,
image segmentation,
autoregressive language model*,
generative pre-trained transformer*,
smart city,
smart home,
smart grid,
smart health,
smart manufacturing,
smart agriculture,
smart environment,
smart energy,
smart mobility,
smart buildings,
smart tourism,
smart logistics,
smart supply chain,
smart retail,
smart waste management,
smart parking,
smart governance,
smart education,
smart technolog*,
smart diagnostic*,
data* analytic*,
hadoop*,
mapreduce,
map$reduce,
large$ dataset*,
data warehouse*,
predictive analytic*,
no$sql,
nosql,
no sql,
unstructured data*,
data science*

File diff suppressed because one or more lines are too long

@ -22,10 +22,7 @@
"country_mode = \"CU\" #CU-country-region AU-address"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -36,10 +33,7 @@
"# (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"computer vision\") OR TS=(\"pattern recognition\")) AND"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -66,10 +60,7 @@
"keywords_str"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -102,10 +93,7 @@
"# eu_sub_str"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -114,10 +102,7 @@
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -137,10 +122,7 @@
"coop_countries[-5:]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -160,10 +142,7 @@
"foc_str"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -184,10 +163,7 @@
"scope_query"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -208,10 +184,7 @@
"ch_scope_query"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -232,10 +205,7 @@
"eu_scope_query"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -246,10 +216,7 @@
"sub_queries = [f'PY=(2011-2022) AND ({i_str}) AND ({keywords_str})' for i_str in [foc_str,eu_str,assoc_str,nor_str,swi_str,uk_str]+eu_sub_str]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -260,10 +227,7 @@
"from wossel_miners import wos_fetch_entries,wos_fetch_yearly_output"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -282,10 +246,7 @@
"wos_fetch_yearly_output(query_str_list=sub_queries)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -305,10 +266,7 @@
"scope_query"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
},
{
@ -342,10 +300,7 @@
"wos_fetch_entries(query_str=scope_query)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
"collapsed": false
}
}
],
@ -370,4 +325,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
}
}

@ -0,0 +1,294 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"focal_countries_list = [\"Peoples R china\", \"Hong Kong\"]"
]
},
{
"cell_type": "code",
"execution_count": 66,
"outputs": [],
"source": [
"country_mode = \"CU\" #CU-country-region AU-address"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 67,
"outputs": [],
"source": [
"# (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"computer vision\") OR TS=(\"pattern recognition\")) AND"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 68,
"outputs": [],
"source": [
"keywords_source = r'..\\ai_scope_keywords.txt'\n",
"with open(keywords_source,'r') as f:\n",
" keywords = \"\\n\".join(f.readlines()).replace('\\n','')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 69,
"outputs": [
{
"data": {
"text/plain": "'artificial intelligence*,machine* learn*,neural network*,big data*,deep learn*,pattern recognition,computer vision,image classification,reinforcement learning,support vector machine*,recommender system*,random forest,ensemble model*,image processing,generative network*,ai ethic*,natural language processing,clustering algorithm*,feature extraction,time series forecast*,anomaly detection,identity fraud detection,dimensionality reduction,feature elicitation,chatbot*,clustering,*supervised learning,convolutional network*,convolutional neural,adversarial network*,adversarial neural,adversarial machine,autoencoder*,gated recurrent unit*,perceptron*,feature learning,feature engineering,long short-term memor*,word embedding*,word vector*,gradient descent,k-nearest neighbor*,naive bayes,transfer learning,fuzzy logic,backpropagation,computational modeling,computational statistic*,intelligent agent*,expert system*,decision tree*,Bayesian network*,genetic algorithm*,swarm intelligence,cognitive computing,artificial neural network*convolutional neural network*,recurrent neural network*,ensemble learning,data mining,artificial general intelligence,artificial consciousness,evolutionary algorithm*,self-organizing map*,deep reinforcement learning,adversarial machine learning,machine vision,neural-symbolic integration,probabilistic graphical model*,hybrid intelligent system*,machine creativity,explainable AI,interactive machine learning,artificial emotional intelligence,evolutionary computation*,human-in-the-loop,unsupervised deep learning,deep belief network*,quantum machine learning,artificial immune system*,swarm robotics,autonomous agents,machine ethics,collaborative filtering,content based filtering,pervasive computing,ubiquitous computing,human-computer interaction,cloud computing,Internet of Things,artificial cognition,computational creativity,sentiment analy*,robotics,boltzmann machine*,kernel machine*,Hopfield network*,Hebbian learning,latent factor 
model*,non-negative matrix factorization,independent component analysis,principal component analysis,data augmentation,image segmentation,autoregressive language model*,generative pre-trained transformer*,smart city,smart home,smart grid,smart health,smart manufacturing,smart agriculture,smart environment,smart energy,smart mobility,smart buildings,smart tourism,smart logistics,smart supply chain,smart retail,smart waste management,smart parking,smart governance,smart education,smart technolog*,smart diagnostic*,data* analytic*,hadoop*,mapreduce,map$reduce,large$ dataset*,data warehouse*,predictive analytic*,no$sql,nosql,no sql,unstructured data*,data science*'"
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keywords"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 70,
"outputs": [
{
"data": {
"text/plain": "'\"artificial intelligence*\" OR \"machine* learn*\" OR \"neural network*\" OR \"big data*\" OR \"deep learn*\" OR \"pattern recognition\" OR \"computer vision\" OR \"image classification\" OR \"reinforcement learning\" OR \"support vector machine*\" OR \"recommender system*\" OR \"random forest\" OR \"ensemble model*\" OR \"image processing\" OR \"generative network*\" OR \"ai ethic*\" OR \"natural language processing\" OR \"clustering algorithm*\" OR \"feature extraction\" OR \"time series forecast*\" OR \"anomaly detection\" OR \"identity fraud detection\" OR \"dimensionality reduction\" OR \"feature elicitation\" OR \"chatbot*\" OR \"clustering\" OR \"*supervised learning\" OR \"convolutional network*\" OR \"convolutional neural\" OR \"adversarial network*\" OR \"adversarial neural\" OR \"adversarial machine\" OR \"autoencoder*\" OR \"gated recurrent unit*\" OR \"perceptron*\" OR \"feature learning\" OR \"feature engineering\" OR \"long short-term memor*\" OR \"word embedding*\" OR \"word vector*\" OR \"gradient descent\" OR \"k-nearest neighbor*\" OR \"naive bayes\" OR \"transfer learning\" OR \"fuzzy logic\" OR \"backpropagation\" OR \"computational modeling\" OR \"computational statistic*\" OR \"intelligent agent*\" OR \"expert system*\" OR \"decision tree*\" OR \"Bayesian network*\" OR \"genetic algorithm*\" OR \"swarm intelligence\" OR \"cognitive computing\" OR \"artificial neural network*convolutional neural network*\" OR \"recurrent neural network*\" OR \"ensemble learning\" OR \"data mining\" OR \"artificial general intelligence\" OR \"artificial consciousness\" OR \"evolutionary algorithm*\" OR \"self-organizing map*\" OR \"deep reinforcement learning\" OR \"adversarial machine learning\" OR \"machine vision\" OR \"neural-symbolic integration\" OR \"probabilistic graphical model*\" OR \"hybrid intelligent system*\" OR \"machine creativity\" OR \"explainable AI\" OR \"interactive machine learning\" OR \"artificial emotional 
intelligence\" OR \"evolutionary computation*\" OR \"human-in-the-loop\" OR \"unsupervised deep learning\" OR \"deep belief network*\" OR \"quantum machine learning\" OR \"artificial immune system*\" OR \"swarm robotics\" OR \"autonomous agents\" OR \"machine ethics\" OR \"collaborative filtering\" OR \"content based filtering\" OR \"pervasive computing\" OR \"ubiquitous computing\" OR \"human-computer interaction\" OR \"cloud computing\" OR \"Internet of Things\" OR \"artificial cognition\" OR \"computational creativity\" OR \"sentiment analy*\" OR \"robotics\" OR \"boltzmann machine*\" OR \"kernel machine*\" OR \"Hopfield network*\" OR \"Hebbian learning\" OR \"latent factor model*\" OR \"non-negative matrix factorization\" OR \"independent component analysis\" OR \"principal component analysis\" OR \"data augmentation\" OR \"image segmentation\" OR \"autoregressive language model*\" OR \"generative pre-trained transformer*\" OR \"smart city\" OR \"smart home\" OR \"smart grid\" OR \"smart health\" OR \"smart manufacturing\" OR \"smart agriculture\" OR \"smart environment\" OR \"smart energy\" OR \"smart mobility\" OR \"smart buildings\" OR \"smart tourism\" OR \"smart logistics\" OR \"smart supply chain\" OR \"smart retail\" OR \"smart waste management\" OR \"smart parking\" OR \"smart governance\" OR \"smart education\" OR \"smart technolog*\" OR \"smart diagnostic*\" OR \"data* analytic*\" OR \"hadoop*\" OR \"mapreduce\" OR \"map$reduce\" OR \"large$ dataset*\" OR \"data warehouse*\" OR \"predictive analytic*\" OR \"no$sql\" OR \"nosql\" OR \"no sql\" OR \"unstructured data*\" OR \"data science*\"'"
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keywords = [c.strip() for c in keywords.split(\",\")]\n",
"\n",
"keywords_str = ' OR '.join('\\\"'+k+'\\\"' for k in keywords)\n",
"keywords_str"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 70,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 71,
"outputs": [],
"source": [
"scope_country_source = r'..\\eu_scope_countries.txt'\n",
"\n",
"with open(scope_country_source,'r') as f:\n",
" coop_countries = f.readlines()\n",
"coop_countries = [c.strip().upper() for c in coop_countries[0].split(\",\")]\n",
"focal_countries = [c.strip().upper() for c in focal_countries_list]\n",
"eu_countries = coop_countries[0:-7]\n",
"assoc_countries = coop_countries[-7:]\n",
"\n",
"nor_c = [coop_countries[-7],]\n",
"swi_c = [coop_countries[-6],]\n",
"uk_c = coop_countries[-5:]\n",
"\n",
"foc_str = ' OR '.join([c for c in focal_countries])\n",
"coop_str = ' OR '.join([c for c in coop_countries])\n",
"eu_str = ' OR '.join([c for c in eu_countries])\n",
"assoc_str = ' OR '.join([c for c in assoc_countries])\n",
"\n",
"nor_str =' OR '.join([c for c in nor_c])\n",
"swi_str =' OR '.join([c for c in swi_c])\n",
"uk_str =' OR '.join([c for c in uk_c])\n",
"eu_sub_str = eu_str.split(' OR ')\n",
"# eu_sub_str"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 72,
"outputs": [
{
"data": {
"text/plain": "'AUSTRIA OR BELGIUM OR BULGARIA OR CROATIA OR CYPRUS OR CZECH REPUBLIC OR DENMARK OR ESTONIA OR FINLAND OR FRANCE OR GERMANY OR GREECE OR HUNGARY OR IRELAND OR ITALY OR LATVIA OR LITHUANIA OR LUXEMBOURG OR MALTA OR NETHERLANDS OR POLAND OR PORTUGAL OR ROMANIA OR SLOVAKIA OR SLOVENIA OR SPAIN OR SWEDEN OR NORWAY OR SWITZERLAND OR UNITED KINGDOM OR ENGLAND OR WALES OR SCOTLAND OR N IRELAND'"
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eu_assoc = ' OR '.join([eu_str,nor_str,swi_str,uk_str])\n",
"eu_assoc"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [
{
"data": {
"text/plain": "'PEOPLES R CHINA OR HONG KONG'"
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"foc_str"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 74,
"outputs": [
{
"data": {
"text/plain": "'CU=(PEOPLES R CHINA OR HONG KONG) AND CU=(AUSTRIA OR BELGIUM OR BULGARIA OR CROATIA OR CYPRUS OR CZECH REPUBLIC OR DENMARK OR ESTONIA OR FINLAND OR FRANCE OR GERMANY OR GREECE OR HUNGARY OR IRELAND OR ITALY OR LATVIA OR LITHUANIA OR LUXEMBOURG OR MALTA OR NETHERLANDS OR POLAND OR PORTUGAL OR ROMANIA OR SLOVAKIA OR SLOVENIA OR SPAIN OR SWEDEN OR NORWAY OR SWITZERLAND OR UNITED KINGDOM OR ENGLAND OR WALES OR SCOTLAND OR N IRELAND) AND TS=(\"artificial intelligence*\" OR \"machine* learn*\" OR \"neural network*\" OR \"big data*\" OR \"deep learn*\" OR \"pattern recognition\" OR \"computer vision\" OR \"image classification\" OR \"reinforcement learning\" OR \"support vector machine*\" OR \"recommender system*\" OR \"random forest\" OR \"ensemble model*\" OR \"image processing\" OR \"generative network*\" OR \"ai ethic*\" OR \"natural language processing\" OR \"clustering algorithm*\" OR \"feature extraction\" OR \"time series forecast*\" OR \"anomaly detection\" OR \"identity fraud detection\" OR \"dimensionality reduction\" OR \"feature elicitation\" OR \"chatbot*\" OR \"clustering\" OR \"*supervised learning\" OR \"convolutional network*\" OR \"convolutional neural\" OR \"adversarial network*\" OR \"adversarial neural\" OR \"adversarial machine\" OR \"autoencoder*\" OR \"gated recurrent unit*\" OR \"perceptron*\" OR \"feature learning\" OR \"feature engineering\" OR \"long short-term memor*\" OR \"word embedding*\" OR \"word vector*\" OR \"gradient descent\" OR \"k-nearest neighbor*\" OR \"naive bayes\" OR \"transfer learning\" OR \"fuzzy logic\" OR \"backpropagation\" OR \"computational modeling\" OR \"computational statistic*\" OR \"intelligent agent*\" OR \"expert system*\" OR \"decision tree*\" OR \"Bayesian network*\" OR \"genetic algorithm*\" OR \"swarm intelligence\" OR \"cognitive computing\" OR \"artificial neural network*convolutional neural network*\" OR \"recurrent neural network*\" OR \"ensemble learning\" OR \"data mining\" OR 
\"artificial general intelligence\" OR \"artificial consciousness\" OR \"evolutionary algorithm*\" OR \"self-organizing map*\" OR \"deep reinforcement learning\" OR \"adversarial machine learning\" OR \"machine vision\" OR \"neural-symbolic integration\" OR \"probabilistic graphical model*\" OR \"hybrid intelligent system*\" OR \"machine creativity\" OR \"explainable AI\" OR \"interactive machine learning\" OR \"artificial emotional intelligence\" OR \"evolutionary computation*\" OR \"human-in-the-loop\" OR \"unsupervised deep learning\" OR \"deep belief network*\" OR \"quantum machine learning\" OR \"artificial immune system*\" OR \"swarm robotics\" OR \"autonomous agents\" OR \"machine ethics\" OR \"collaborative filtering\" OR \"content based filtering\" OR \"pervasive computing\" OR \"ubiquitous computing\" OR \"human-computer interaction\" OR \"cloud computing\" OR \"Internet of Things\" OR \"artificial cognition\" OR \"computational creativity\" OR \"sentiment analy*\" OR \"robotics\" OR \"boltzmann machine*\" OR \"kernel machine*\" OR \"Hopfield network*\" OR \"Hebbian learning\" OR \"latent factor model*\" OR \"non-negative matrix factorization\" OR \"independent component analysis\" OR \"principal component analysis\" OR \"data augmentation\" OR \"image segmentation\" OR \"autoregressive language model*\" OR \"generative pre-trained transformer*\" OR \"smart city\" OR \"smart home\" OR \"smart grid\" OR \"smart health\" OR \"smart manufacturing\" OR \"smart agriculture\" OR \"smart environment\" OR \"smart energy\" OR \"smart mobility\" OR \"smart buildings\" OR \"smart tourism\" OR \"smart logistics\" OR \"smart supply chain\" OR \"smart retail\" OR \"smart waste management\" OR \"smart parking\" OR \"smart governance\" OR \"smart education\" OR \"smart technolog*\" OR \"smart diagnostic*\" OR \"data* analytic*\" OR \"hadoop*\" OR \"mapreduce\" OR \"map$reduce\" OR \"large$ dataset*\" OR \"data warehouse*\" OR \"predictive analytic*\" OR \"no$sql\" OR 
\"nosql\" OR \"no sql\" OR \"unstructured data*\" OR \"data science*\") AND PY=(2011-2022)'"
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scope_query = f'CU=({foc_str}) AND CU=({eu_assoc}) AND TS=({keywords_str}) AND PY=(2011-2022)'\n",
"scope_query"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 74,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 75,
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pytest'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[75], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mwossel_miners\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m wos_fetch_entries,wos_fetch_yearly_output\n",
"File \u001B[1;32m~\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wossel_miners.py:3\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mos\u001B[39;00m\n\u001B[0;32m 2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mglob\u001B[39;00m\n\u001B[1;32m----> 3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mpytest\u001B[39;00m\n\u001B[0;32m 4\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtime\u001B[39;00m\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mdatetime\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m datetime\n",
"\u001B[1;31mModuleNotFoundError\u001B[0m: No module named 'pytest'"
]
}
],
"source": [
"from wossel_miners import wos_fetch_entries,wos_fetch_yearly_output"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# wos_fetch_yearly_output(query_str_list=sub_queries)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# wos_fetch_entries(query_str=scope_query)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

BIN
WOS/wos_extract/wos_records_concat.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:1b4488a74c6abb033610f3d72af50b4d4aa1f0d455b0fa7399161d8a11cdf085
3 size 126669330

@ -0,0 +1,188 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"import re\n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Concatting records for query:\n",
"\n",
"['(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)']\n"
]
}
],
"source": [
"folder_token=\"2023-04-04-12-58-59-994722save\"\n",
"workdir_path=fr\"wos_downloads/entry_batches/{folder_token}\"\n",
"outfile='wos_records_concat.csv'\n",
"try:\n",
" os.remove(outfile)\n",
"except FileNotFoundError:\n",
" pass\n",
"with_header=True\n",
"for root, dirs, files in os.walk(workdir_path):\n",
" for filename in files:\n",
" path=os.path.join(root, filename)\n",
" if filename.startswith(\"records_\"):\n",
" chunk = pd.read_csv(path, sep=\"\\t\")\n",
" chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
" with_header = False\n",
" elif filename.startswith(\"query\"):\n",
" with open(path,\"r\") as f:\n",
" q=f.readlines()\n",
" print(\"Concatting records for query:\\n\")\n",
" print(q)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"# df_pre = pd.read_excel(r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\v1_\\wosexport1.xls\")\n",
"# list(df_pre.columns[:-1])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"col_vals = ['Publication Type',\n",
" 'Authors',\n",
" 'Book Authors',\n",
" 'Book Editors',\n",
" 'Book Group Authors',\n",
" 'Author Full Names',\n",
" 'Book Author Full Names',\n",
" 'Group Authors',\n",
" 'Article Title',\n",
" 'Source Title',\n",
" 'Book Series Title',\n",
" 'Book Series Subtitle',\n",
" 'Language',\n",
" 'Document Type',\n",
" 'Conference Title',\n",
" 'Conference Date',\n",
" 'Conference Location',\n",
" 'Conference Sponsor',\n",
" 'Conference Host',\n",
" 'Author Keywords',\n",
" 'Keywords Plus',\n",
" 'Abstract',\n",
" 'Addresses',\n",
" 'Affiliations',\n",
" 'Reprint Addresses',\n",
" 'Email Addresses',\n",
" 'Researcher Ids',\n",
" 'ORCIDs',\n",
" 'Funding Orgs',\n",
" 'Funding Name Preferred',\n",
" 'Funding Text',\n",
" 'Cited References',\n",
" 'Cited Reference Count',\n",
" 'Times Cited, WoS Core',\n",
" 'Times Cited, All Databases',\n",
" '180 Day Usage Count',\n",
" 'Since 2013 Usage Count',\n",
" 'Publisher',\n",
" 'Publisher City',\n",
" 'Publisher Address',\n",
" 'ISSN',\n",
" 'eISSN',\n",
" 'ISBN',\n",
" 'Journal Abbreviation',\n",
" 'Journal ISO Abbreviation',\n",
" 'Publication Date',\n",
" 'Publication Year',\n",
" 'Volume',\n",
" 'Issue',\n",
" 'Part Number',\n",
" 'Supplement',\n",
" 'Special Issue',\n",
" 'Meeting Abstract',\n",
" 'Start Page',\n",
" 'End Page',\n",
" 'Article Number',\n",
" 'DOI',\n",
" 'DOI Link',\n",
" 'Book DOI',\n",
" 'Early Access Date',\n",
" 'Number of Pages',\n",
" 'WoS Categories',\n",
" 'Web of Science Index',\n",
" 'Research Areas',\n",
" 'IDS Number',\n",
" 'Pubmed Id',\n",
" 'Open Access Designations',\n",
" 'Highly Cited Status',\n",
" 'Hot Paper Status',\n",
" 'Date of Export',\n",
" 'UT (Unique WOS ID)']"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [],
"source": [
"df = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
"df.columns = col_vals\n",
"# df\n",
"df.to_csv(outfile, index=False, header=True, sep=\"\\t\")"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save