init commit

utku_keyword_suggestion
radvanyimome 2 years ago
commit e060613834

8
.idea/.gitignore vendored

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9 (MOME_BIGDATA)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="JupyterPackageInspection" enabled="false" level="WARNING" enabled_by_default="false" />
</profile>
</component>

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (MOME_BIGDATA)" project-jdk-type="Python SDK" />
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ZSI_analytics.iml" filepath="$PROJECT_DIR$/.idea/ZSI_analytics.iml" />
</modules>
</component>
</project>

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 143,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import pyarrow\n",
"from tqdm import tqdm\n",
"import os\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": "'2.0.0rc1'"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"ename": "OptionError",
"evalue": "'You can only set the value of existing options'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mOptionError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mpd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43moptions\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmode\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdtype_backend\u001B[49m \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mpyarrow\u001B[39m\u001B[38;5;124m'\u001B[39m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_config\\config.py:226\u001B[0m, in \u001B[0;36mDictWrapper.__setattr__\u001B[1;34m(self, key, val)\u001B[0m\n\u001B[0;32m 224\u001B[0m _set_option(prefix, val)\n\u001B[0;32m 225\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 226\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m OptionError(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mYou can only set the value of existing options\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
"\u001B[1;31mOptionError\u001B[0m: 'You can only set the value of existing options'"
]
}
],
"source": [
"# pd.options.mode #= 'pyarrow'"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 119,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\n",
"D:\\PATSTAT\n"
]
}
],
"source": [
"import os\n",
"print(os.getcwd()) # Prints the current working directory\n",
"\n",
"workdir_path=r\"D:\\PATSTAT\"\n",
"os.chdir(workdir_path)\n",
"print(os.getcwd())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CONCAT: ['tls201_part01.csv', 'tls201_part02.csv', 'tls201_part03.csv'] TO D:\\PATSTAT\\table_tls201.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1000it [07:12, 2.31it/s]\n",
"1000it [07:25, 2.25it/s]\n",
"429it [02:55, 2.44it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part02.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part03.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls202_part01.csv TO D:\\PATSTAT\\table_tls202.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls204_part01\\tls204_part01.csv TO D:\\PATSTAT\\table_tls204.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls205_part01\\tls205_part01.csv TO D:\\PATSTAT\\table_tls205.csv\n",
"CONCAT: ['tls206_part01.csv', 'tls206_part02.csv'] TO D:\\PATSTAT\\table_tls206.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1000it [07:21, 2.27it/s]\n",
"715it [05:32, 2.15it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part01\\tls206_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part02\\tls206_part02.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls207_part01\\tls207_part01.csv TO D:\\PATSTAT\\table_tls207.csv\n",
"CONCAT: ['tls209_part01.csv', 'tls209_part02.csv'] TO D:\\PATSTAT\\table_tls209.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"3200it [07:35, 7.02it/s]\n",
"3049it [07:15, 7.01it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part01\\tls209_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part02\\tls209_part02.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls210_part01\\tls210_part01.csv TO D:\\PATSTAT\\table_tls210.csv\n",
"CONCAT: ['tls211_part01.csv', 'tls211_part02.csv'] TO D:\\PATSTAT\\table_tls211.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2500it [08:45, 4.76it/s]\n",
"369it [01:23, 4.42it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part01\\tls211_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part02\\tls211_part02.csv\n",
"CONCAT: ['tls212_part01.csv', 'tls212_part02.csv', 'tls212_part03.csv'] TO D:\\PATSTAT\\table_tls212.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"4000it [12:45, 5.23it/s]\n",
"4000it [12:57, 5.14it/s]\n",
"1232it [03:54, 5.25it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls212_part01\\tls212_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part02\\tls212_part02.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part03\\tls212_part03.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls214_part01\\tls214_part01.csv TO D:\\PATSTAT\\table_tls214.csv\n",
"CONCAT: ['tls215_part01.csv', 'tls215_part02.csv', 'tls215_part03.csv'] TO D:\\PATSTAT\\table_tls215.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"8000it [15:10, 8.79it/s]\n",
"3524it [06:32, 9.30it/s]"
]
}
],
"source": [
"# Merge the multi-part PATSTAT CSV exports found under workdir_path into one\n",
"# table_<name>.csv per table: a single part file is moved as-is, multiple\n",
"# part files are concatenated chunk-wise and the parts deleted afterwards.\n",
"file_path_dict=dict()\n",
"\n",
"# iterate over files in\n",
"# that directory\n",
"for root, dirs, files in os.walk(workdir_path):\n",
"    for filename in files:\n",
"        if filename.endswith(\".csv\") and not filename.startswith(\"table\"):\n",
"            path= os.path.join(root, filename)\n",
"            file_path_dict[filename] = path\n",
"\n",
"# Table name is the prefix before the first underscore, e.g. 'tls201'.\n",
"complete_file_set = set()\n",
"for fname in file_path_dict.keys():\n",
"    complete_file_set.add(fname.split(\"_\")[0])\n",
"complete_file_set = sorted(complete_file_set)\n",
"\n",
"for complete_file in complete_file_set:\n",
"    # NOTE(review): substring match — a table name that happens to be a\n",
"    # substring of another file's name would over-collect; a startswith\n",
"    # check would be safer.\n",
"    file_list = [file for file in file_path_dict.keys() if complete_file in file]\n",
"\n",
"    outfile_path = os.path.join(workdir_path,\"table_\"+complete_file+\".csv\")\n",
"    # print(outfile_path,file_list)\n",
"\n",
"    if len(file_list)==1:\n",
"        file_path = file_path_dict.get(file_list[0])\n",
"        print(\"MOVE:\",file_path ,\"TO\",outfile_path)\n",
"        shutil.move(file_path, outfile_path)\n",
"    else:\n",
"        print(\"CONCAT:\",file_list ,\"TO\",outfile_path)\n",
"\n",
"\n",
"        CHUNK_SIZE = 50000\n",
"        # Emit the CSV header only once (first chunk of the first part).\n",
"        with_header=True\n",
"        # NOTE(review): first_one is assigned but never read — dead variable.\n",
"        first_one = True\n",
"        for csv_file_name in file_list:\n",
"            csv_file_path = file_path_dict.get(csv_file_name)\n",
"            chunk_container = pd.read_csv(csv_file_path, chunksize=CHUNK_SIZE,low_memory=False)\n",
"            # NOTE(review): mode=\"a\" appends — rerunning this cell with a\n",
"            # pre-existing table_<name>.csv would duplicate its rows.\n",
"            for chunk in tqdm(chunk_container):\n",
"                chunk.to_csv(outfile_path, mode=\"a\", index=False, header=with_header)\n",
"                with_header=False\n",
"            first_one = False\n",
"\n",
"        print(\"deleting\")\n",
"        # Remove source part files only after the concatenation completed.\n",
"        for csv_file_name in file_list:\n",
"            csv_file_path = file_path_dict.get(csv_file_name)\n",
"            print(csv_file_path)\n",
"            os.remove(csv_file_path)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

@ -0,0 +1,343 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import os\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wos_extract\\wosexport1.xls\n",
"wos_extract\\wosexport10.xls\n",
"wos_extract\\wosexport11.xls\n",
"wos_extract\\wosexport2.xls\n",
"wos_extract\\wosexport3.xls\n",
"wos_extract\\wosexport4.xls\n",
"wos_extract\\wosexport5.xls\n",
"wos_extract\\wosexport6.xls\n",
"wos_extract\\wosexport7.xls\n",
"wos_extract\\wosexport8.xls\n",
"wos_extract\\wosexport9.xls\n"
]
}
],
"source": [
"# Append every wos_extract/wosexport*.xls export into one tab-separated CSV.\n",
"workdir_path=r\"wos_extract\"\n",
"outfile='wos_extract_complete.csv'\n",
"# Write the column header only for the first exported file.\n",
"with_header=True\n",
"for root, dirs, files in os.walk(workdir_path):\n",
"    for filename in files:\n",
"        if filename.startswith(\"wosexport\"):\n",
"            path=os.path.join(root, filename)\n",
"            print(path)\n",
"            chunk = pd.read_excel(path)\n",
"            # NOTE(review): mode=\"a\" appends — rerunning this cell without\n",
"            # deleting the existing outfile duplicates rows.\n",
"            chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n",
"            with_header = False"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [],
"source": [
"wos = pd.read_csv(outfile, sep=\"\\t\", encoding='ISO8859-1', low_memory=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# wos\n",
"record_col=\"UT (Unique WOS ID)\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [],
"source": [
"# wos[[\"Addresses\",\"Affiliations\",record_col]]\n",
"import unidecode"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Addresses\n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ...\n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...\n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...\n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...\n... ... ...\n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...\n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...\n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...\n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...\n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ...\n\n[64339 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Addresses</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>b'Smith, Vincent] Nat Hist Museum, London SW7 ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209536100003</td>\n <td>b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>74669</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...</td>\n </tr>\n <tr>\n <th>74670</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...</td>\n </tr>\n <tr>\n <th>74671</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...</td>\n </tr>\n <tr>\n <th>74672</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...</td>\n </tr>\n <tr>\n <th>74673</th>\n <td>WOS:000947693400001</td>\n <td>b'Jiang, Linhua] Univ Politecn Valencia, Europ...</td>\n </tr>\n </tbody>\n</table>\n<p>64339 rows × 2 columns</p>\n</div>"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per address per WOS record: split each record's Addresses field on\n",
"# '[' (which opens each author-list/affiliation segment), explode to rows,\n",
"# and drop the intermediate level index.\n",
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
"# NOTE(review): encode() turns each address into a bytes object (the b'...'\n",
"# visible in the output below); downstream text processing generally expects\n",
"# str — confirm this conversion is intentional.\n",
"locations[\"Addresses\"]=locations[\"Addresses\"].apply(lambda x: x.encode('utf-8'))\n",
"locations"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 48,
"outputs": [],
"source": [
"def extract_countries(text):\n",
"    \"\"\"Return the list of countries geograpy finds in `text`, or None on failure.\"\"\"\n",
"    try:\n",
"        # BUG FIX: the body previously referenced the undefined name `x`\n",
"        # instead of the parameter `text`; the broad except silently masked\n",
"        # the NameError, so every call returned None.\n",
"        return geograpy.get_place_context(text=text).countries\n",
"    except Exception:\n",
"        # geograpy can raise UnicodeDecodeError on some inputs (seen above);\n",
"        # best-effort: treat any failure as \"no country found\".\n",
"        return None"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[50], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m text\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mStoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m----> 2\u001B[0m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtext\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtext\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241m.\u001B[39mcountries\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m<listcomp>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other.<locals>.unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m<genexpr>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n",
"\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>"
]
}
],
"source": [
"text=\"Stoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\"\n",
"geograpy.get_place_context(text=text).countries"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Addresses \n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ... \\\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ... \n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi... \n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ... \n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi... \n... ... ... \n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I... \n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U... \n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji... \n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U... \n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ... \n\n Country \n1 None \n2 None \n3 None \n4 None \n5 None \n... ... \n74669 None \n74670 None \n74671 None \n74672 None \n74673 None \n\n[64339 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Addresses</th>\n <th>Country</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>b'Smith, Vincent] Nat Hist Museum, London SW7 ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209536100003</td>\n <td>b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>74669</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74670</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74671</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74672</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74673</th>\n <td>WOS:000947693400001</td>\n <td>b'Jiang, Linhua] Univ Politecn Valencia, Europ...</td>\n <td>None</td>\n </tr>\n </tbody>\n</table>\n<p>64339 rows × 3 columns</p>\n</div>"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# BUG FIX: extract_countries() was called with no argument, so the lambda's\n",
"# `x` was never passed and no address text ever reached the extractor\n",
"# (every Country came back None in the output below).\n",
"locations[\"Country\"]=locations['Addresses'].apply(lambda x: extract_countries(x))\n",
"locations"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"ename": "NameError",
"evalue": "name 'nltk' is not defined",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[26], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m geograpy\u001B[38;5;241m-\u001B[39m\u001B[43mnltk\u001B[49m\n",
"\u001B[1;31mNameError\u001B[0m: name 'nltk' is not defined"
]
}
],
"source": [
"import geograpy\n",
"geograpy-nltk"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading C:\\Users\\radvanyi/.geograpy3/locations.db.gz from https://raw.githubusercontent.com/wiki/somnathrakshit/geograpy3/data/locations.db.gz ... this might take a few seconds\n",
"Unzipping C:\\Users\\radvanyi/.geograpy3/locations.db from C:\\Users\\radvanyi/.geograpy3/locations.db.gz\n",
"Extracting completed\n"
]
},
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[27], line 3\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[0;32m 2\u001B[0m url \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mhttps://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[1;32m----> 3\u001B[0m places \u001B[38;5;241m=\u001B[39m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_geoPlace_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43murl\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:24\u001B[0m, in \u001B[0;36mget_geoPlace_context\u001B[1;34m(url, text, debug)\u001B[0m\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mget_geoPlace_context\u001B[39m(url\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m, text\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,debug\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[0;32m 10\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[0;32m 11\u001B[0m \u001B[38;5;124;03m Get a place context for a given text with information\u001B[39;00m\n\u001B[0;32m 12\u001B[0m \u001B[38;5;124;03m about country, region, city and other\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;124;03m PlaceContext: the place context\u001B[39;00m\n\u001B[0;32m 23\u001B[0m \u001B[38;5;124;03m '''\u001B[39;00m \n\u001B[1;32m---> 24\u001B[0m places\u001B[38;5;241m=\u001B[39m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtext\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlabels\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mLabels\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgeo\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdebug\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdebug\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 25\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m places\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m<listcomp>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other.<locals>.unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m<genexpr>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n",
"\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>"
]
}
],
"source": [
"import geograpy\n",
"url = 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay'\n",
"places = geograpy.get_geoPlace_context(url=url)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package maxent_ne_chunker to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package maxent_ne_chunker is already up-to-date!\n",
"[nltk_data] Downloading package words to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package words is already up-to-date!\n"
]
},
{
"data": {
"text/plain": "True"
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import geograpy\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('maxent_ne_chunker')\n",
"nltk.download('words')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"United Kingdom\n",
"United States\n"
]
}
],
"source": [
"# import pycountry\n",
"# text = \"United States (New York), United Kingdom (London)\"\n",
"# for country in pycountry.countries:\n",
"# if country.name in text:\n",
"# print(country.name)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

@ -0,0 +1,16 @@
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
def print_hi(name):
    """Print a friendly greeting for ``name`` to standard output."""
    # Use a breakpoint in the code line below to debug your script.
    message = f'Hi, {name}'  # Press Ctrl+F8 to toggle the breakpoint.
    print(message)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Run the sample greeting when executed as a script (not on import).
    greeting_target = 'PyCharm'
    print_hi(greeting_target)
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
Loading…
Cancel
Save