You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_extract/wos_query_generator_legacy....

328 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"focal_countries_list = [\"Peoples R china\", \"Hong Kong\"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"country_mode = \"CU\" #CU-country-region AU-address"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"computer vision\") OR TS=(\"pattern recognition\")) AND"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": "'TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")'"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keywords_source = r'..\\ai_scope_keywords.txt'\n",
"with open(keywords_source,'r') as f:\n",
" keywords = f.readlines()\n",
"\n",
"keywords = [c.strip() for c in keywords[0].split(\",\")]\n",
"\n",
"keywords_str = ' OR '.join('TS=(\\\"'+k+'\\\")' for k in keywords)\n",
"keywords_str"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"scope_country_source = r'..\\eu_scope_countries.txt'\n",
"\n",
"with open(scope_country_source,'r') as f:\n",
" coop_countries = f.readlines()\n",
"coop_countries = [c.strip().upper() for c in coop_countries[0].split(\",\")]\n",
"focal_countries = [c.strip().upper() for c in focal_countries_list]\n",
"eu_countries = coop_countries[0:-7]\n",
"assoc_countries = coop_countries[-7:]\n",
"\n",
"nor_c = [coop_countries[-7],]\n",
"swi_c = [coop_countries[-6],]\n",
"uk_c = coop_countries[-5:]\n",
"\n",
"foc_str = ' OR '.join([country_mode+'='+c for c in focal_countries])\n",
"coop_str = ' OR '.join([country_mode+'='+c for c in coop_countries])\n",
"eu_str = ' OR '.join([country_mode+'='+c for c in eu_countries])\n",
"assoc_str = ' OR '.join([country_mode+'='+c for c in assoc_countries])\n",
"\n",
"nor_str =' OR '.join([country_mode+'='+c for c in nor_c])\n",
"swi_str =' OR '.join([country_mode+'='+c for c in swi_c])\n",
"uk_str =' OR '.join([country_mode+'='+c for c in uk_c])\n",
"eu_sub_str = eu_str.split(' OR ')\n",
"# eu_sub_str"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "['UNITED KINGDOM', 'ENGLAND', 'WALES', 'SCOTLAND', 'N IRELAND']"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"coop_countries[-5:]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "'CU=PEOPLES R CHINA OR CU=HONG KONG'"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"foc_str"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)'"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scope_query = f'({foc_str}) AND ({coop_str}) AND ({keywords_str}) AND PY=(2011-2022)'\n",
"scope_query"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ch_scope_query = f'({foc_str}) AND ({keywords_str})'\n",
"ch_scope_query"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "'(CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eu_scope_query = f'({eu_str}) AND ({keywords_str})'\n",
"eu_scope_query"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"sub_queries = [f'PY=(2011-2022) AND ({i_str}) AND ({keywords_str})' for i_str in [foc_str,eu_str,assoc_str,nor_str,swi_str,uk_str]+eu_sub_str]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"from wossel_miners import wos_fetch_entries,wos_fetch_yearly_output"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 33/33 [12:49<00:00, 23.31s/it]\n"
]
}
],
"source": [
"wos_fetch_yearly_output(query_str_list=sub_queries)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")) AND PY=(2011-2022)'"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scope_query"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hoooold...\n",
"27672 records found! Here we go in 93 steps...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 92/92 [09:38<00:00, 6.29s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"final batch of 27601-27672\n"
]
}
],
"source": [
"wos_fetch_entries(query_str=scope_query)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}