utku_keyword_suggestion
radvanyimome 1 year ago
parent 230e00d57a
commit edf23fbcda

@ -31,6 +31,50 @@
}
}
},
{
"cell_type": "markdown",
"source": [
"# WOS current query:\n",
"\n",
"\"\"\"(TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\")) AND (CU=PEOPLES R CHINA AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=REPUBLIC OF CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN))\"\"\""
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"other keywords: pattern recognition, computer vision, image classification, reinforcement learning, support vector machines, recommender system, random forest, ensemble models, image processing, generative network, ai ethic, natural language processing, clustering algorithm, feature extraction, time series forecast, anomaly detection, identity fraud detection, dimensionality reduction, feature elicitation, chatbot, clustering, unsupervised learning, supervised learning, convolutional network, adversarial network\n",
"\n",
"# AI ETHICS keyword!!!"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Only CPC classification? Or some basic PTC? (ASEAN analysis had some)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
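If patent classifications come into scope, a minimal filter could look like the sketch below. Everything in it is an assumption: the patents DataFrame, its semicolon-delimited CPC column, and the choice of G06N (computer systems based on specific computational models, which covers machine learning) as the example AI-related subclass.

import pandas as pd

# Hypothetical sketch: keep patents carrying at least one AI-related CPC code.
# 'patents' and its 'CPC' column are assumed names; the prefix list is
# illustrative, not the project's agreed scope.
AI_CPC_PREFIXES = ("G06N",)

def has_ai_cpc(cpc_field):
    codes = [code.strip() for code in str(cpc_field).split(";")]
    return any(code.startswith(AI_CPC_PREFIXES) for code in codes)

patents = pd.DataFrame({"CPC": ["G06N 20/00; H04L 9/40", "A61K 31/00"]})
ai_patents = patents[patents["CPC"].apply(has_ai_cpc)]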
{
"cell_type": "code",
"execution_count": 1,
@ -73,7 +117,12 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"outputs": [], "outputs": [],
"source": [], "source": [
"# Baseline of co-publications\n",
"#\n",
"# Use address instead of CU?\n",
"# plus countries UK Norway Switzerland | Turkey Serbia"
],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {

@ -0,0 +1 @@
artificial intelligence,machine learning,neural network,big data,deep learning,pattern recognition,computer vision, image classification, reinforcement learning, support vector machines, recommender system, random forest, ensemble model, image processing, generative network, ai ethic, natural language processing, clustering algorithm, feature extraction, time series forecast, anomaly detection, identity fraud detection, dimensionality reduction, feature elicitation, chatbot, clustering, unsupervised learning, supervised learning, convolutional network, adversarial network

@ -0,0 +1 @@
Austria, Belgium, Bulgaria, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Ireland, Italy, Latvia, Lithuania, Luxembourg, Malta, Netherlands, Poland, Portugal, Romania, Slovakia, Slovenia, Spain, Sweden, Norway, Switzerland, United Kingdom, England, Wales, Scotland, N Ireland

File diff suppressed because one or more lines are too long

@ -0,0 +1,199 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"focal_countries_list = [\"Peoples R china\", \"Hong Kong\"]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [],
"source": [
"country_mode = \"CU\" #CU-country-region AU-address"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [],
"source": [
"# (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"computer vision\") OR TS=(\"pattern recognition\")) AND"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [
{
"data": {
"text/plain": "'TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\")'"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keywords_source = r'..\\ai_scope_keywords.txt'\n",
"with open(keywords_source,'r') as f:\n",
" keywords = f.readlines()\n",
"\n",
"keywords = [c.strip() for c in keywords[0].split(\",\")]\n",
"\n",
"keywords_str = ' OR '.join('TS=(\"'+k+'\")' for k in keywords)\n",
"keywords_str"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [
{
"data": {
"text/plain": "'CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND'"
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scope_country_source = r'..\\eu_scope_countries.txt'\n",
"\n",
"with open(scope_country_source,'r') as f:\n",
" coop_countries = f.readlines()\n",
"coop_countries = [c.strip().upper() for c in coop_countries[0].split(\",\")]\n",
"focal_countries = [c.strip().upper() for c in focal_countries_list]\n",
"\n",
"foc_str = ' OR '.join([country_mode+'='+c for c in focal_countries])\n",
"coop_str = ' OR '.join([country_mode+'='+c for c in coop_countries])\n",
"\n",
"coop_str"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [
{
"data": {
"text/plain": "'CU=PEOPLES R CHINA OR CU=HONG KONG'"
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"foc_str"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 58,
"outputs": [
{
"data": {
"text/plain": "'(CU=PEOPLES R CHINA OR CU=HONG KONG) AND (CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scope_query = f'({foc_str}) AND ({coop_str}) AND ({keywords_str})'\n",
"scope_query"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 60,
"outputs": [
{
"data": {
"text/plain": "'(CU=AUSTRIA OR CU=BELGIUM OR CU=BULGARIA OR CU=CROATIA OR CU=CYPRUS OR CU=CZECH REPUBLIC OR CU=DENMARK OR CU=ESTONIA OR CU=FINLAND OR CU=FRANCE OR CU=GERMANY OR CU=GREECE OR CU=HUNGARY OR CU=IRELAND OR CU=ITALY OR CU=LATVIA OR CU=LITHUANIA OR CU=LUXEMBOURG OR CU=MALTA OR CU=NETHERLANDS OR CU=POLAND OR CU=PORTUGAL OR CU=ROMANIA OR CU=SLOVAKIA OR CU=SLOVENIA OR CU=SPAIN OR CU=SWEDEN OR CU=NORWAY OR CU=SWITZERLAND OR CU=UNITED KINGDOM OR CU=ENGLAND OR CU=WALES OR CU=SCOTLAND OR CU=N IRELAND) AND (TS=(\"artificial intelligence\") OR TS=(\"machine learning\") OR TS=(\"neural network\") OR TS=(\"big data\") OR TS=(\"deep learning\") OR TS=(\"pattern recognition\") OR TS=(\"computer vision\") OR TS=(\"image classification\") OR TS=(\"reinforcement learning\") OR TS=(\"support vector machines\") OR TS=(\"recommender system\") OR TS=(\"random forest\") OR TS=(\"ensemble model\") OR TS=(\"image processing\") OR TS=(\"generative network\") OR TS=(\"ai ethic\") OR TS=(\"natural language processing\") OR TS=(\"clustering algorithm\") OR TS=(\"feature extraction\") OR TS=(\"time series forecast\") OR TS=(\"anomaly detection\") OR TS=(\"identity fraud detection\") OR TS=(\"dimensionality reduction\") OR TS=(\"feature elicitation\") OR TS=(\"chatbot\") OR TS=(\"clustering\") OR TS=(\"unsupervised learning\") OR TS=(\"supervised learning\") OR TS=(\"convolutional network\") OR TS=(\"adversarial network\"))'"
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ch_scope_query = f'({coop_str}) AND ({keywords_str})'\n",
"ch_scope_query"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Binary file not shown.

@ -8,9 +8,9 @@
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"from tqdm import tqdm\n",
"import os\n", "import os\n",
"import shutil" "import shutil\n",
"from flashgeotext.geotext import GeoText"
] ]
}, },
{ {
@ -121,12 +121,263 @@
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Keywords Plus \n0 WOS:000852293800024 CONVOLUTIONAL NEURAL-NETWORK; DEEP LEARNING FR... \\\n9714 WOS:000540750000002 STATE-SPACE RECONSTRUCTION; SURFACE AIR-TEMPER... \n9697 WOS:000600708400002 COMPRESSIVE STRENGTH; MODELS; ADABOOST.RT; DUC... \n9699 WOS:000511965100005 STRUCTURAL RELIABILITY; FAILURE MODES \n9701 WOS:000663142500003 REFLECTED GPS SIGNALS; SOIL-MOISTURE; OCEAN; S... \n... ... ... \n3066 WOS:000528727500074 LOCAL SEARCH; ALGORITHM; VARIANCE; MODEL \n5097 WOS:000596139400001 INDUSTRY 4.0; MANAGEMENT; RISK; ANALYTICS; CHA... \n11369 WOS:000436774300069 NaN \n11368 WOS:000846290700001 PARTIAL LEAST-SQUARES; INFRARED-SPECTROSCOPY; ... \n11362 WOS:000480527800025 MICROWAVE DIELECTRIC BEHAVIOR; GPS SIGNALS; RE... \n\n Author Keywords \n0 Imaging; Three-dimensional displays; Electroma... \\\n9714 NaN \n9697 Plastic hinge length; RC columns; Machine lear... \n9699 system reliability; jacket platform; beta-unzi... \n9701 Cyclone GNSS (CYGNSS); Sea surface wind speed;... \n... ... \n3066 sea surface temperature; sea surface temperatu... \n5097 Big data finance; Big data in financial servic... \n11369 planetary gear; fault diagnosis; VMD; center f... \n11368 soil fertility class; reflectance spectroscopy... \n11362 global navigation satellite system (GNSS)-refl... \n\n Article Title \n0 Artificial Intelligence: New Frontiers in Real... \\\n9714 Detecting causality from time series in a mach... \n9697 Data-Driven Approach to Predict the Plastic Hi... \n9699 System Reliability Analysis of an Offshore Jac... \n9701 Analysis of coastal wind speed retrieval from ... \n... ... \n3066 Improved Particle Swarm Optimization for Sea S... \n5097 Current landscape and influence of big data on... \n11369 Planetary Gear Fault Diagnosis via Feature Ima... \n11368 How Well Can Reflectance Spectroscopy Allocate... \n11362 GNSS-R Soil Moisture Retrieval Based on a XGbo... \n\n Abstract \n0 In recent years, artificial intelligence (AI) ... \n9714 Detecting causality from observational data is... \n9697 Inelastic response of reinforced concrete colu... \n9699 This study investigates strategies for solving... \n9701 This paper demonstrates the capability and per... \n... ... \n3066 The Sea Surface Temperature (SST) is one of th... \n5097 Big data is one of the most recent business an... \n11369 Poor working environment leads to frequent fai... \n11368 Fertilization decisions depend on the measurem... \n11362 Global navigation satellite system (GNSS)-refl... \n\n[9889 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Keywords Plus</th>\n <th>Author Keywords</th>\n <th>Article Title</th>\n <th>Abstract</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000852293800024</td>\n <td>CONVOLUTIONAL NEURAL-NETWORK; DEEP LEARNING FR...</td>\n <td>Imaging; Three-dimensional displays; Electroma...</td>\n <td>Artificial Intelligence: New Frontiers in Real...</td>\n <td>In recent years, artificial intelligence (AI) ...</td>\n </tr>\n <tr>\n <th>9714</th>\n <td>WOS:000540750000002</td>\n <td>STATE-SPACE RECONSTRUCTION; SURFACE AIR-TEMPER...</td>\n <td>NaN</td>\n <td>Detecting causality from time series in a mach...</td>\n <td>Detecting causality from observational data is...</td>\n </tr>\n <tr>\n <th>9697</th>\n <td>WOS:000600708400002</td>\n <td>COMPRESSIVE STRENGTH; MODELS; ADABOOST.RT; DUC...</td>\n <td>Plastic hinge length; RC columns; Machine lear...</td>\n <td>Data-Driven Approach to Predict the Plastic Hi...</td>\n <td>Inelastic response of reinforced concrete colu...</td>\n </tr>\n <tr>\n <th>9699</th>\n <td>WOS:000511965100005</td>\n <td>STRUCTURAL RELIABILITY; FAILURE MODES</td>\n <td>system reliability; jacket platform; beta-unzi...</td>\n <td>System Reliability Analysis of an Offshore Jac...</td>\n <td>This study investigates strategies for solving...</td>\n </tr>\n <tr>\n <th>9701</th>\n <td>WOS:000663142500003</td>\n <td>REFLECTED GPS SIGNALS; SOIL-MOISTURE; OCEAN; S...</td>\n <td>Cyclone GNSS (CYGNSS); Sea surface wind speed;...</td>\n <td>Analysis of coastal wind speed retrieval from ...</td>\n <td>This paper demonstrates the capability and per...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>3066</th>\n <td>WOS:000528727500074</td>\n <td>LOCAL SEARCH; ALGORITHM; VARIANCE; MODEL</td>\n <td>sea surface temperature; sea surface temperatu...</td>\n <td>Improved Particle Swarm Optimization for Sea S...</td>\n <td>The Sea Surface Temperature (SST) is one of th...</td>\n </tr>\n <tr>\n <th>5097</th>\n <td>WOS:000596139400001</td>\n <td>INDUSTRY 4.0; MANAGEMENT; RISK; ANALYTICS; CHA...</td>\n <td>Big data finance; Big data in financial servic...</td>\n <td>Current landscape and influence of big data on...</td>\n <td>Big data is one of the most recent business an...</td>\n </tr>\n <tr>\n <th>11369</th>\n <td>WOS:000436774300069</td>\n <td>NaN</td>\n <td>planetary gear; fault diagnosis; VMD; center f...</td>\n <td>Planetary Gear Fault Diagnosis via Feature Ima...</td>\n <td>Poor working environment leads to frequent fai...</td>\n </tr>\n <tr>\n <th>11368</th>\n <td>WOS:000846290700001</td>\n <td>PARTIAL LEAST-SQUARES; INFRARED-SPECTROSCOPY; ...</td>\n <td>soil fertility class; reflectance spectroscopy...</td>\n <td>How Well Can Reflectance Spectroscopy Allocate...</td>\n <td>Fertilization decisions depend on the measurem...</td>\n </tr>\n <tr>\n <th>11362</th>\n <td>WOS:000480527800025</td>\n <td>MICROWAVE DIELECTRIC BEHAVIOR; GPS SIGNALS; RE...</td>\n <td>global navigation satellite system (GNSS)-refl...</td>\n <td>GNSS-R Soil Moisture Retrieval Based on a XGbo...</td>\n <td>Global navigation satellite system (GNSS)-refl...</td>\n </tr>\n 
</tbody>\n</table>\n<p>9889 rows × 5 columns</p>\n</div>"
},
"execution_count": 11,
"metadata": {}, "metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[[record_col,\"Keywords Plus\",\"Author Keywords\",\"Article Title\",\"Abstract\"]]\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) level_1 keyword\n0 WOS:000209536100003 11117 NaN\n1 WOS:000297893800037 10831 ADAPTIVE DYNAMIC SURFACE CONTROL\n2 WOS:000297893800037 10831 NEURAL COMPENSATOR\n3 WOS:000297893800037 10831 BUCK CONVERTER\n4 WOS:000297893800037 10831 FINITE-TIME IDENTIFIER\n... ... ... ...\n94060 WOS:000947693400001 240 EXPRESSION\n94061 WOS:000947693400001 240 RNALOCATE\n94062 WOS:000947693400001 240 PROTEINS\n94063 WOS:000947693400001 240 RESOURCE\n94064 WOS:000947693400001 240 CELLS\n\n[94065 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>level_1</th>\n <th>keyword</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000209536100003</td>\n <td>11117</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000297893800037</td>\n <td>10831</td>\n <td>ADAPTIVE DYNAMIC SURFACE CONTROL</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000297893800037</td>\n <td>10831</td>\n <td>NEURAL COMPENSATOR</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000297893800037</td>\n <td>10831</td>\n <td>BUCK CONVERTER</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000297893800037</td>\n <td>10831</td>\n <td>FINITE-TIME IDENTIFIER</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>94060</th>\n <td>WOS:000947693400001</td>\n <td>240</td>\n <td>EXPRESSION</td>\n </tr>\n <tr>\n <th>94061</th>\n <td>WOS:000947693400001</td>\n <td>240</td>\n <td>RNALOCATE</td>\n </tr>\n <tr>\n <th>94062</th>\n <td>WOS:000947693400001</td>\n <td>240</td>\n <td>PROTEINS</td>\n </tr>\n <tr>\n <th>94063</th>\n <td>WOS:000947693400001</td>\n <td>240</td>\n <td>RESOURCE</td>\n </tr>\n <tr>\n <th>94064</th>\n <td>WOS:000947693400001</td>\n <td>240</td>\n <td>CELLS</td>\n </tr>\n </tbody>\n</table>\n<p>94065 rows × 3 columns</p>\n</div>"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_df = pd.DataFrame()\n",
"for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
" kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
" kwp.name = 'keyword'\n",
" kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
"kw_df"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
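A natural next step for keyword suggestion is ranking the exploded keywords by how many distinct records carry them; a minimal sketch, assuming kw_df and record_col as defined above:

# Distinct WOS records per keyword; frequent ones are candidates for
# extending ai_scope_keywords.txt
top_keywords = (
    kw_df.dropna(subset=["keyword"])
         .groupby("keyword")[record_col]
         .nunique()
         .sort_values(ascending=False)
)
top_keywords.head(50)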
{
"cell_type": "code",
"execution_count": 39,
"outputs": [
{
"data": {
"text/plain": "Downloading pytorch_model.bin: 0%| | 0.00/438M [00:00<?, ?B/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "43fb040512964c61bdcca3e35d4e9778"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "ChunkedEncodingError",
"evalue": "(\"Connection broken: ConnectionResetError(10054, 'A létező kapcsolatot a távoli állomás kényszerítetten bezárta', None, 10054, None)\", ConnectionResetError(10054, 'A létező kapcsolatot a távoli állomás kényszerítetten bezárta', None, 10054, None))",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mConnectionResetError\u001B[0m Traceback (most recent call last)",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\urllib3\\response.py:444\u001B[0m, in \u001B[0;36mHTTPResponse._error_catcher\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 443\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 444\u001B[0m \u001B[38;5;28;01myield\u001B[39;00m\n\u001B[0;32m 446\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m SocketTimeout:\n\u001B[0;32m 447\u001B[0m \u001B[38;5;66;03m# FIXME: Ideally we'd like to include the url in the ReadTimeoutError but\u001B[39;00m\n\u001B[0;32m 448\u001B[0m \u001B[38;5;66;03m# there is yet no clean way to get at it from this context.\u001B[39;00m\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\urllib3\\response.py:567\u001B[0m, in \u001B[0;36mHTTPResponse.read\u001B[1;34m(self, amt, decode_content, cache_content)\u001B[0m\n\u001B[0;32m 566\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_error_catcher():\n\u001B[1;32m--> 567\u001B[0m data \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_fp_read\u001B[49m\u001B[43m(\u001B[49m\u001B[43mamt\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m fp_closed \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;124mb\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 568\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m amt \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\urllib3\\response.py:533\u001B[0m, in \u001B[0;36mHTTPResponse._fp_read\u001B[1;34m(self, amt)\u001B[0m\n\u001B[0;32m 531\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 532\u001B[0m \u001B[38;5;66;03m# StringIO doesn't like amt=None\u001B[39;00m\n\u001B[1;32m--> 533\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_fp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mread\u001B[49m\u001B[43m(\u001B[49m\u001B[43mamt\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mif\u001B[39;00m amt \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_fp\u001B[38;5;241m.\u001B[39mread()\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\http\\client.py:463\u001B[0m, in \u001B[0;36mHTTPResponse.read\u001B[1;34m(self, amt)\u001B[0m\n\u001B[0;32m 462\u001B[0m b \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mbytearray\u001B[39m(amt)\n\u001B[1;32m--> 463\u001B[0m n \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mreadinto\u001B[49m\u001B[43m(\u001B[49m\u001B[43mb\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 464\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mmemoryview\u001B[39m(b)[:n]\u001B[38;5;241m.\u001B[39mtobytes()\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\http\\client.py:507\u001B[0m, in \u001B[0;36mHTTPResponse.readinto\u001B[1;34m(self, b)\u001B[0m\n\u001B[0;32m 504\u001B[0m \u001B[38;5;66;03m# we do not use _safe_read() here because this may be a .will_close\u001B[39;00m\n\u001B[0;32m 505\u001B[0m \u001B[38;5;66;03m# connection, and the user is reading more bytes than will be provided\u001B[39;00m\n\u001B[0;32m 506\u001B[0m \u001B[38;5;66;03m# (for example, reading in 1k chunks)\u001B[39;00m\n\u001B[1;32m--> 507\u001B[0m n \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mreadinto\u001B[49m\u001B[43m(\u001B[49m\u001B[43mb\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 508\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m n \u001B[38;5;129;01mand\u001B[39;00m b:\n\u001B[0;32m 509\u001B[0m \u001B[38;5;66;03m# Ideally, we would raise IncompleteRead if the content-length\u001B[39;00m\n\u001B[0;32m 510\u001B[0m \u001B[38;5;66;03m# wasn't satisfied, but it might break compatibility.\u001B[39;00m\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\socket.py:704\u001B[0m, in \u001B[0;36mSocketIO.readinto\u001B[1;34m(self, b)\u001B[0m\n\u001B[0;32m 703\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 704\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_sock\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrecv_into\u001B[49m\u001B[43m(\u001B[49m\u001B[43mb\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 705\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m timeout:\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\ssl.py:1242\u001B[0m, in \u001B[0;36mSSLSocket.recv_into\u001B[1;34m(self, buffer, nbytes, flags)\u001B[0m\n\u001B[0;32m 1239\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m 1240\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mnon-zero flags not allowed in calls to recv_into() on \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m%\u001B[39m\n\u001B[0;32m 1241\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__class__\u001B[39m)\n\u001B[1;32m-> 1242\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mread\u001B[49m\u001B[43m(\u001B[49m\u001B[43mnbytes\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mbuffer\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1243\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\ssl.py:1100\u001B[0m, in \u001B[0;36mSSLSocket.read\u001B[1;34m(self, len, buffer)\u001B[0m\n\u001B[0;32m 1099\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m buffer \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m-> 1100\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_sslobj\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mread\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mlen\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mbuffer\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1101\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n",
"\u001B[1;31mConnectionResetError\u001B[0m: [WinError 10054] A létező kapcsolatot a távoli állomás kényszerítetten bezárta",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001B[1;31mProtocolError\u001B[0m Traceback (most recent call last)",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\requests\\models.py:816\u001B[0m, in \u001B[0;36mResponse.iter_content.<locals>.generate\u001B[1;34m()\u001B[0m\n\u001B[0;32m 815\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 816\u001B[0m \u001B[38;5;28;01myield from\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mraw\u001B[38;5;241m.\u001B[39mstream(chunk_size, decode_content\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[0;32m 817\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m ProtocolError \u001B[38;5;28;01mas\u001B[39;00m e:\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\urllib3\\response.py:628\u001B[0m, in \u001B[0;36mHTTPResponse.stream\u001B[1;34m(self, amt, decode_content)\u001B[0m\n\u001B[0;32m 627\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_fp_closed(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_fp):\n\u001B[1;32m--> 628\u001B[0m data \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mread\u001B[49m\u001B[43m(\u001B[49m\u001B[43mamt\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mamt\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdecode_content\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdecode_content\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 630\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m data:\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\urllib3\\response.py:593\u001B[0m, in \u001B[0;36mHTTPResponse.read\u001B[1;34m(self, amt, decode_content, cache_content)\u001B[0m\n\u001B[0;32m 584\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39menforce_content_length \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlength_remaining \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m (\n\u001B[0;32m 585\u001B[0m \u001B[38;5;241m0\u001B[39m,\n\u001B[0;32m 586\u001B[0m \u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 591\u001B[0m \u001B[38;5;66;03m# raised during streaming, so all calls with incorrect\u001B[39;00m\n\u001B[0;32m 592\u001B[0m \u001B[38;5;66;03m# Content-Length are caught.\u001B[39;00m\n\u001B[1;32m--> 593\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m IncompleteRead(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_fp_bytes_read, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlength_remaining)\n\u001B[0;32m 595\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m data:\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\contextlib.py:137\u001B[0m, in \u001B[0;36m_GeneratorContextManager.__exit__\u001B[1;34m(self, typ, value, traceback)\u001B[0m\n\u001B[0;32m 136\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 137\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgen\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mthrow\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtyp\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mvalue\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtraceback\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 138\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mStopIteration\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m exc:\n\u001B[0;32m 139\u001B[0m \u001B[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001B[39;00m\n\u001B[0;32m 140\u001B[0m \u001B[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001B[39;00m\n\u001B[0;32m 141\u001B[0m \u001B[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001B[39;00m\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\urllib3\\response.py:461\u001B[0m, in \u001B[0;36mHTTPResponse._error_catcher\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 459\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m (HTTPException, SocketError) \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[0;32m 460\u001B[0m \u001B[38;5;66;03m# This includes IncompleteRead.\u001B[39;00m\n\u001B[1;32m--> 461\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m ProtocolError(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mConnection broken: \u001B[39m\u001B[38;5;132;01m%r\u001B[39;00m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m%\u001B[39m e, e)\n\u001B[0;32m 463\u001B[0m \u001B[38;5;66;03m# If no exception is thrown, we should avoid cleaning up\u001B[39;00m\n\u001B[0;32m 464\u001B[0m \u001B[38;5;66;03m# unnecessarily.\u001B[39;00m\n",
"\u001B[1;31mProtocolError\u001B[0m: (\"Connection broken: ConnectionResetError(10054, 'A létező kapcsolatot a távoli állomás kényszerítetten bezárta', None, 10054, None)\", ConnectionResetError(10054, 'A létező kapcsolatot a távoli állomás kényszerítetten bezárta', None, 10054, None))",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001B[1;31mChunkedEncodingError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[39], line 7\u001B[0m\n\u001B[0;32m 4\u001B[0m \u001B[38;5;66;03m# Uses stopwords for english from NLTK, and all puntuation characters by\u001B[39;00m\n\u001B[0;32m 5\u001B[0m \u001B[38;5;66;03m# default\u001B[39;00m\n\u001B[0;32m 6\u001B[0m r \u001B[38;5;241m=\u001B[39m Rake()\n\u001B[1;32m----> 7\u001B[0m kw_model \u001B[38;5;241m=\u001B[39m \u001B[43mKeyBERT\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mall-mpnet-base-v2\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\keybert\\_model.py:55\u001B[0m, in \u001B[0;36mKeyBERT.__init__\u001B[1;34m(self, model)\u001B[0m\n\u001B[0;32m 39\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, model\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mall-MiniLM-L6-v2\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n\u001B[0;32m 40\u001B[0m \u001B[38;5;124;03m\"\"\"KeyBERT initialization\u001B[39;00m\n\u001B[0;32m 41\u001B[0m \n\u001B[0;32m 42\u001B[0m \u001B[38;5;124;03m Arguments:\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 53\u001B[0m \u001B[38;5;124;03m * https://www.sbert.net/docs/pretrained_models.html\u001B[39;00m\n\u001B[0;32m 54\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m---> 55\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel \u001B[38;5;241m=\u001B[39m \u001B[43mselect_backend\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\keybert\\backend\\_utils.py:49\u001B[0m, in \u001B[0;36mselect_backend\u001B[1;34m(embedding_model)\u001B[0m\n\u001B[0;32m 47\u001B[0m \u001B[38;5;66;03m# Create a Sentence Transformer model based on a string\u001B[39;00m\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(embedding_model, \u001B[38;5;28mstr\u001B[39m):\n\u001B[1;32m---> 49\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mSentenceTransformerBackend\u001B[49m\u001B[43m(\u001B[49m\u001B[43membedding_model\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 51\u001B[0m \u001B[38;5;66;03m# Hugging Face embeddings\u001B[39;00m\n\u001B[0;32m 52\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(embedding_model, Pipeline):\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\keybert\\backend\\_sentencetransformers.py:42\u001B[0m, in \u001B[0;36mSentenceTransformerBackend.__init__\u001B[1;34m(self, embedding_model)\u001B[0m\n\u001B[0;32m 40\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39membedding_model \u001B[38;5;241m=\u001B[39m embedding_model\n\u001B[0;32m 41\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(embedding_model, \u001B[38;5;28mstr\u001B[39m):\n\u001B[1;32m---> 42\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39membedding_model \u001B[38;5;241m=\u001B[39m \u001B[43mSentenceTransformer\u001B[49m\u001B[43m(\u001B[49m\u001B[43membedding_model\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 43\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 44\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m 45\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mPlease select a correct SentenceTransformers model: \u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 46\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m`from sentence_transformers import SentenceTransformer` \u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 47\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m`model = SentenceTransformer(\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mall-MiniLM-L6-v2\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m)`\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 48\u001B[0m )\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\sentence_transformers\\SentenceTransformer.py:87\u001B[0m, in \u001B[0;36mSentenceTransformer.__init__\u001B[1;34m(self, model_name_or_path, modules, device, cache_folder, use_auth_token)\u001B[0m\n\u001B[0;32m 83\u001B[0m model_path \u001B[38;5;241m=\u001B[39m os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mjoin(cache_folder, model_name_or_path\u001B[38;5;241m.\u001B[39mreplace(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_\u001B[39m\u001B[38;5;124m\"\u001B[39m))\n\u001B[0;32m 85\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mexists(os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mjoin(model_path, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmodules.json\u001B[39m\u001B[38;5;124m'\u001B[39m)):\n\u001B[0;32m 86\u001B[0m \u001B[38;5;66;03m# Download from hub with caching\u001B[39;00m\n\u001B[1;32m---> 87\u001B[0m \u001B[43msnapshot_download\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel_name_or_path\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 88\u001B[0m \u001B[43m \u001B[49m\u001B[43mcache_dir\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mcache_folder\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 89\u001B[0m \u001B[43m \u001B[49m\u001B[43mlibrary_name\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43msentence-transformers\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[0;32m 90\u001B[0m \u001B[43m \u001B[49m\u001B[43mlibrary_version\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m__version__\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 91\u001B[0m \u001B[43m \u001B[49m\u001B[43mignore_files\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mflax_model.msgpack\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mrust_model.ot\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtf_model.h5\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 92\u001B[0m \u001B[43m \u001B[49m\u001B[43muse_auth_token\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43muse_auth_token\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 94\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mexists(os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mjoin(model_path, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmodules.json\u001B[39m\u001B[38;5;124m'\u001B[39m)): \u001B[38;5;66;03m#Load as SentenceTransformer model\u001B[39;00m\n\u001B[0;32m 95\u001B[0m modules \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_load_sbert_model(model_path)\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\sentence_transformers\\util.py:491\u001B[0m, in \u001B[0;36msnapshot_download\u001B[1;34m(repo_id, revision, cache_dir, library_name, library_version, user_agent, ignore_files, use_auth_token)\u001B[0m\n\u001B[0;32m 486\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m version\u001B[38;5;241m.\u001B[39mparse(huggingface_hub\u001B[38;5;241m.\u001B[39m__version__) \u001B[38;5;241m>\u001B[39m\u001B[38;5;241m=\u001B[39m version\u001B[38;5;241m.\u001B[39mparse(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m0.8.1\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n\u001B[0;32m 487\u001B[0m \u001B[38;5;66;03m# huggingface_hub v0.8.1 introduces a new cache layout. We sill use a manual layout\u001B[39;00m\n\u001B[0;32m 488\u001B[0m \u001B[38;5;66;03m# And need to pass legacy_cache_layout=True to avoid that a warning will be printed\u001B[39;00m\n\u001B[0;32m 489\u001B[0m cached_download_args[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mlegacy_cache_layout\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n\u001B[1;32m--> 491\u001B[0m path \u001B[38;5;241m=\u001B[39m cached_download(\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mcached_download_args)\n\u001B[0;32m 493\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mexists(path \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m.lock\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n\u001B[0;32m 494\u001B[0m os\u001B[38;5;241m.\u001B[39mremove(path \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m.lock\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\huggingface_hub\\utils\\_validators.py:120\u001B[0m, in \u001B[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001B[1;34m(*args, **kwargs)\u001B[0m\n\u001B[0;32m 117\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m check_use_auth_token:\n\u001B[0;32m 118\u001B[0m kwargs \u001B[38;5;241m=\u001B[39m smoothly_deprecate_use_auth_token(fn_name\u001B[38;5;241m=\u001B[39mfn\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m, has_token\u001B[38;5;241m=\u001B[39mhas_token, kwargs\u001B[38;5;241m=\u001B[39mkwargs)\n\u001B[1;32m--> 120\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m fn(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\huggingface_hub\\file_download.py:780\u001B[0m, in \u001B[0;36mcached_download\u001B[1;34m(url, library_name, library_version, cache_dir, user_agent, force_download, force_filename, proxies, etag_timeout, resume_download, token, local_files_only, legacy_cache_layout)\u001B[0m\n\u001B[0;32m 777\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m temp_file_manager() \u001B[38;5;28;01mas\u001B[39;00m temp_file:\n\u001B[0;32m 778\u001B[0m logger\u001B[38;5;241m.\u001B[39minfo(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mdownloading \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m to \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, url, temp_file\u001B[38;5;241m.\u001B[39mname)\n\u001B[1;32m--> 780\u001B[0m \u001B[43mhttp_get\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 781\u001B[0m \u001B[43m \u001B[49m\u001B[43murl_to_download\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 782\u001B[0m \u001B[43m \u001B[49m\u001B[43mtemp_file\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 783\u001B[0m \u001B[43m \u001B[49m\u001B[43mproxies\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mproxies\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 784\u001B[0m \u001B[43m \u001B[49m\u001B[43mresume_size\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mresume_size\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 785\u001B[0m \u001B[43m \u001B[49m\u001B[43mheaders\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mheaders\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 786\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 788\u001B[0m logger\u001B[38;5;241m.\u001B[39minfo(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mstoring \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m in cache at \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, url, cache_path)\n\u001B[0;32m 789\u001B[0m _chmod_and_replace(temp_file\u001B[38;5;241m.\u001B[39mname, cache_path)\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\huggingface_hub\\file_download.py:538\u001B[0m, in \u001B[0;36mhttp_get\u001B[1;34m(url, temp_file, proxies, resume_size, headers, timeout, max_retries)\u001B[0m\n\u001B[0;32m 528\u001B[0m displayed_name \u001B[38;5;241m=\u001B[39m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m(…)\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mdisplayed_name[\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m20\u001B[39m:]\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 530\u001B[0m progress \u001B[38;5;241m=\u001B[39m tqdm(\n\u001B[0;32m 531\u001B[0m unit\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mB\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 532\u001B[0m unit_scale\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 536\u001B[0m disable\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mbool\u001B[39m(logger\u001B[38;5;241m.\u001B[39mgetEffectiveLevel() \u001B[38;5;241m==\u001B[39m logging\u001B[38;5;241m.\u001B[39mNOTSET),\n\u001B[0;32m 537\u001B[0m )\n\u001B[1;32m--> 538\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m chunk \u001B[38;5;129;01min\u001B[39;00m r\u001B[38;5;241m.\u001B[39miter_content(chunk_size\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m10\u001B[39m \u001B[38;5;241m*\u001B[39m \u001B[38;5;241m1024\u001B[39m \u001B[38;5;241m*\u001B[39m \u001B[38;5;241m1024\u001B[39m):\n\u001B[0;32m 539\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m chunk: \u001B[38;5;66;03m# filter out keep-alive new chunks\u001B[39;00m\n\u001B[0;32m 540\u001B[0m progress\u001B[38;5;241m.\u001B[39mupdate(\u001B[38;5;28mlen\u001B[39m(chunk))\n",
"File \u001B[1;32m~\\Anaconda3\\envs\\MOME_BIGDATA\\lib\\site-packages\\requests\\models.py:818\u001B[0m, in \u001B[0;36mResponse.iter_content.<locals>.generate\u001B[1;34m()\u001B[0m\n\u001B[0;32m 816\u001B[0m \u001B[38;5;28;01myield from\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mraw\u001B[38;5;241m.\u001B[39mstream(chunk_size, decode_content\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[0;32m 817\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m ProtocolError \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[1;32m--> 818\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m ChunkedEncodingError(e)\n\u001B[0;32m 819\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m DecodeError \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[0;32m 820\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m ContentDecodingError(e)\n",
"\u001B[1;31mChunkedEncodingError\u001B[0m: (\"Connection broken: ConnectionResetError(10054, 'A létező kapcsolatot a távoli állomás kényszerítetten bezárta', None, 10054, None)\", ConnectionResetError(10054, 'A létező kapcsolatot a távoli állomás kényszerítetten bezárta', None, 10054, None))"
]
}
],
"source": [
"from rake_nltk import Rake\n",
"from keybert import KeyBERT\n",
"\n",
"# Uses stopwords for english from NLTK, and all puntuation characters by\n",
"# default\n",
"r = Rake()\n",
"kw_model = KeyBERT(model='all-mpnet-base-v2')\n",
"\n",
"# Extraction given the text.\n",
"# r.extract_keywords_from_text(<text to process>)\n",
"\n",
"# keywords = kw_model.extract_keywords(full_text,\n",
"#\n",
"# keyphrase_ngram_range=(1, 3),\n",
"#\n",
"# stop_words='english',\n",
"#\n",
"# highlight=False,\n",
"#\n",
"# top_n=10)\n",
"#\n",
"# keywords_list= list(dict(keywords).keys())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
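The traceback above is a dropped connection while downloading the sentence-transformer weights (WinError 10054: "an existing connection was forcibly closed by the remote host"); re-running the cell retries the download. One way to make the cell survive a flaky connection is a retry wrapper; a minimal sketch, assuming only that KeyBERT(model=...) raises when the download fails:

import time
from keybert import KeyBERT

def load_keybert(model_name="all-mpnet-base-v2", retries=3, wait=10):
    # Hypothetical helper: retry transient network failures during the
    # one-time model download, then re-raise if all attempts fail.
    for attempt in range(retries):
        try:
            return KeyBERT(model=model_name)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(wait)

kw_model = load_keybert()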
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'get_ranked_phrases'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mAttributeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[32], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mRake\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mextract_keywords_from_text\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mmy time to shine\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_ranked_phrases\u001B[49m()\n",
"\u001B[1;31mAttributeError\u001B[0m: 'NoneType' object has no attribute 'get_ranked_phrases'"
]
}
],
"source": [
"Rake().extract_keywords_from_text(\"my time to shine\").get_ranked_phrases()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
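The AttributeError is the rake_nltk API rather than the input: extract_keywords_from_text() stores its results on the Rake instance and returns None, so the calls cannot be chained. Keeping a reference to the instance works:

r = Rake()
r.extract_keywords_from_text("my time to shine")
r.get_ranked_phrases()  # e.g. ['time', 'shine'] after stopword removal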
{
"cell_type": "code",
"execution_count": 37,
"outputs": [], "outputs": [],
"source": [ "source": [
"from flashgeotext.geotext import GeoText\n", "def kwd_rake(text):\n",
" r = Rake()\n",
" r.extract_keywords_from_sentences(text)\n",
" return r.get_ranked_phrases()\n",
"\n", "\n",
"kwds_rake = wos[\"Abstract\"].fillna(\"\").map(kwd_rake)\n",
"# kwds_bert = wos[\"A\"]\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
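The commented-out kwds_bert line could be completed the same way once the model download succeeds; a sketch reusing the extract_keywords parameters from the commented block above (kwd_bert is a hypothetical helper name):

def kwd_bert(text, top_n=10):
    # kw_model.extract_keywords returns (phrase, score) pairs
    pairs = kw_model.extract_keywords(
        text, keyphrase_ngram_range=(1, 3), stop_words="english", top_n=top_n
    )
    return [phrase for phrase, score in pairs]

# kwds_bert = wos["Abstract"].fillna("").map(kwd_bert)  # far slower than RAKE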
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": "0 [brief summary could help us better understand...\n9714 [known phase space reconstruction based causal...\n9697 [column behavior requires accurate plastic hin...\n9699 [approach needs excessive computational effort...\n9701 [proposed ann model achieves good wind speed r...\n ... \n3066 [key factors affecting ocean climate change, r...\n5097 [big data influences different financial secto...\n11369 [planetary gear fault diagnosis via feature im...\n11368 [simultaneously predict various soil fertility...\n11362 [recently developed ensemble machine learning ...\nName: Abstract, Length: 9889, dtype: object"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kwds_rake"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [
{
"data": {
"text/plain": "0 None\n9714 None\n9697 None\n9699 None\n9701 None\n ... \n3066 None\n5097 None\n11369 None\n11368 None\n11362 None\nName: Abstract, Length: 9889, dtype: object"
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kwds_rake"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": "'Keywords Plus'"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"geotext = GeoText()\n", "geotext = GeoText()\n",
"\n", "\n",
"def extract_location(input_text, key='countries'):\n", "def extract_location(input_text, key='countries'):\n",