{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "from tqdm import tqdm\n", "import os\n", "import shutil" ] }, { "cell_type": "code", "execution_count": 7, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wos_extract\\wosexport1.xls\n", "wos_extract\\wosexport10.xls\n", "wos_extract\\wosexport11.xls\n", "wos_extract\\wosexport2.xls\n", "wos_extract\\wosexport3.xls\n", "wos_extract\\wosexport4.xls\n", "wos_extract\\wosexport5.xls\n", "wos_extract\\wosexport6.xls\n", "wos_extract\\wosexport7.xls\n", "wos_extract\\wosexport8.xls\n", "wos_extract\\wosexport9.xls\n" ] } ], "source": [ "workdir_path=r\"wos_extract\"\n", "outfile='wos_extract_complete.csv'\n", "with_header=True\n", "for root, dirs, files in os.walk(workdir_path):\n", " for filename in files:\n", " if filename.startswith(\"wosexport\"):\n", " path=os.path.join(root, filename)\n", " print(path)\n", " chunk = pd.read_excel(path)\n", " chunk.to_csv(outfile, mode=\"a\", index=False, header=with_header, sep=\"\\t\")\n", " with_header = False" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 39, "outputs": [], "source": [ "wos = pd.read_csv(outfile, sep=\"\\t\", encoding='ISO8859-1', low_memory=False)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "# wos\n", "record_col=\"UT (Unique WOS ID)\"" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 44, "outputs": [], "source": [ "# wos[[\"Addresses\",\"Affiliations\",record_col]]\n", "import unidecode" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 51, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Addresses\n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ...\n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...\n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...\n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...\n... ... ...\n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...\n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...\n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...\n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...\n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ...\n\n[64339 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)Addresses
1WOS:000209536100003b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...
2WOS:000209536100003b'Smith, Vincent] Nat Hist Museum, London SW7 ...
3WOS:000209536100003b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...
4WOS:000209536100003b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...
5WOS:000209536100003b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...
.........
74669WOS:000947693400001b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...
74670WOS:000947693400001b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...
74671WOS:000947693400001b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...
74672WOS:000947693400001b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...
74673WOS:000947693400001b'Jiang, Linhua] Univ Politecn Valencia, Europ...
\n

64339 rows × 2 columns

\n
" }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n", "locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n", "locations[\"Addresses\"]=locations[\"Addresses\"].apply(lambda x: x.encode('utf-8'))\n", "locations" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 48, "outputs": [], "source": [ "def extract_countries(text):\n", " try:\n", " return geograpy.get_place_context(text=x).countries\n", " except:\n", " return None" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 50, "outputs": [ { "ename": "UnicodeDecodeError", "evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to ", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[50], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m text\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mStoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m----> 2\u001B[0m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtext\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtext\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241m.\u001B[39mcountries\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other..unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n", "\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to " ] } ], "source": [ "text=\"Stoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\"\n", "geograpy.get_place_context(text=text).countries" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 53, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Addresses \n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ... \\\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ... \n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi... \n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ... \n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi... \n... ... ... \n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I... \n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U... \n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji... \n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U... \n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ... \n\n Country \n1 None \n2 None \n3 None \n4 None \n5 None \n... ... \n74669 None \n74670 None \n74671 None \n74672 None \n74673 None \n\n[64339 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)AddressesCountry
1WOS:000209536100003b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...None
2WOS:000209536100003b'Smith, Vincent] Nat Hist Museum, London SW7 ...None
3WOS:000209536100003b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...None
4WOS:000209536100003b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...None
5WOS:000209536100003b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...None
............
74669WOS:000947693400001b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...None
74670WOS:000947693400001b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...None
74671WOS:000947693400001b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...None
74672WOS:000947693400001b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...None
74673WOS:000947693400001b'Jiang, Linhua] Univ Politecn Valencia, Europ...None
\n

64339 rows × 3 columns

\n
" }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "locations[\"Country\"]=locations['Addresses'].apply(lambda x: extract_countries())\n", "locations" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 26, "outputs": [ { "ename": "NameError", "evalue": "name 'nltk' is not defined", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[26], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m geograpy\u001B[38;5;241m-\u001B[39m\u001B[43mnltk\u001B[49m\n", "\u001B[1;31mNameError\u001B[0m: name 'nltk' is not defined" ] } ], "source": [ "import geograpy\n", "geograpy-nltk" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 27, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading C:\\Users\\radvanyi/.geograpy3/locations.db.gz from https://raw.githubusercontent.com/wiki/somnathrakshit/geograpy3/data/locations.db.gz ... this might take a few seconds\n", "Unzipping C:\\Users\\radvanyi/.geograpy3/locations.db from C:\\Users\\radvanyi/.geograpy3/locations.db.gz\n", "Extracting completed\n" ] }, { "ename": "UnicodeDecodeError", "evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to ", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[27], line 3\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[0;32m 2\u001B[0m url \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mhttps://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[1;32m----> 3\u001B[0m places \u001B[38;5;241m=\u001B[39m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_geoPlace_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43murl\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:24\u001B[0m, in \u001B[0;36mget_geoPlace_context\u001B[1;34m(url, text, debug)\u001B[0m\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mget_geoPlace_context\u001B[39m(url\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m, text\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,debug\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[0;32m 10\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[0;32m 11\u001B[0m \u001B[38;5;124;03m Get a place context for a given text with information\u001B[39;00m\n\u001B[0;32m 12\u001B[0m \u001B[38;5;124;03m about country, region, city and other\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;124;03m PlaceContext: the place context\u001B[39;00m\n\u001B[0;32m 23\u001B[0m \u001B[38;5;124;03m '''\u001B[39;00m \n\u001B[1;32m---> 24\u001B[0m places\u001B[38;5;241m=\u001B[39m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtext\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlabels\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mLabels\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgeo\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdebug\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdebug\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 25\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m places\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other..unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n", "\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to " ] } ], "source": [ "import geograpy\n", "url = 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay'\n", "places = geograpy.get_geoPlace_context(url=url)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 28, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n", "[nltk_data] Downloading package maxent_ne_chunker to\n", "[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package maxent_ne_chunker is already up-to-date!\n", "[nltk_data] Downloading package words to\n", "[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package words is already up-to-date!\n" ] }, { "data": { "text/plain": "True" }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import geograpy\n", "import nltk\n", "nltk.download('punkt')\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('maxent_ne_chunker')\n", "nltk.download('words')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 24, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "United Kingdom\n", "United States\n" ] } ], "source": [ "# import pycountry\n", "# text = \"United States (New York), United Kingdom (London)\"\n", "# for country in pycountry.countries:\n", "# if country.name in text:\n", "# print(country.name)" ], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }