You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_concat.ipynb

344 lines
34 KiB
Plaintext

2 years ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"\n",
"import geograpy\n",
"import pandas as pd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wos_extract\\wosexport1.xls\n",
"wos_extract\\wosexport10.xls\n",
"wos_extract\\wosexport11.xls\n",
"wos_extract\\wosexport2.xls\n",
"wos_extract\\wosexport3.xls\n",
"wos_extract\\wosexport4.xls\n",
"wos_extract\\wosexport5.xls\n",
"wos_extract\\wosexport6.xls\n",
"wos_extract\\wosexport7.xls\n",
"wos_extract\\wosexport8.xls\n",
"wos_extract\\wosexport9.xls\n"
]
}
],
"source": [
"# Merge every wosexport* workbook found under workdir_path into one tab-separated file.\n",
"workdir_path=r\"wos_extract\"\n",
"outfile='wos_extract_complete.csv'\n",
"with_header=True\n",
"for root, dirs, files in os.walk(workdir_path):\n",
"    # Sort in place so traversal and merge order do not depend on the directory listing order.\n",
"    dirs.sort()\n",
"    for filename in sorted(files):\n",
"        if filename.startswith(\"wosexport\"):\n",
"            path=os.path.join(root, filename)\n",
"            print(path)\n",
"            chunk = pd.read_excel(path)\n",
"            # The first chunk truncates outfile and writes the header; later chunks append.\n",
"            # Appending from the very first chunk would duplicate every row on each re-run of this cell.\n",
"            chunk.to_csv(outfile, mode=\"w\" if with_header else \"a\", index=False, header=with_header, sep=\"\\t\")\n",
"            with_header = False"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [],
"source": [
"# The merged file is produced by DataFrame.to_csv without an encoding argument, i.e. as UTF-8,\n",
"# so it has to be decoded as UTF-8; reading it as ISO8859-1 garbles every non-ASCII character.\n",
"wos = pd.read_csv(outfile, sep=\"\\t\", encoding='utf-8', low_memory=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# wos\n",
"record_col=\"UT (Unique WOS ID)\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [],
"source": [
"# wos[[\"Addresses\",\"Affiliations\",record_col]]\n",
"import unidecode"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Addresses\n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ...\n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...\n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...\n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...\n... ... ...\n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...\n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...\n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...\n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...\n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ...\n\n[64339 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Addresses</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>b'Smith, Vincent] Nat Hist Museum, London SW7 ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209536100003</td>\n <td>b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>74669</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...</td>\n </tr>\n <tr>\n <th>74670</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...</td>\n </tr>\n <tr>\n <th>74671</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...</td>\n </tr>\n <tr>\n <th>74672</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...</td>\n </tr>\n <tr>\n <th>74673</th>\n <td>WOS:000947693400001</td>\n <td>b'Jiang, Linhua] Univ Politecn Valencia, Europ...</td>\n </tr>\n </tbody>\n</table>\n<p>64339 rows × 2 columns</p>\n</div>"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per '['-delimited segment of each record's Addresses field, ordered by record id.\n",
"# Segments are kept as str: converting them to bytes would hand bytes to later text processing.\n",
"locations = (\n",
"    wos[[record_col, \"Addresses\"]]\n",
"    .dropna()  # a row without a record id or without an address cannot yield a location\n",
"    .sort_values(record_col, kind=\"stable\")\n",
"    .assign(Addresses=lambda frame: frame[\"Addresses\"].str.split(\"[\"))\n",
"    .explode(\"Addresses\")\n",
"    .reset_index(drop=True)\n",
")\n",
"# Splitting leaves an empty string wherever the text starts with '['; drop those segments.\n",
"locations = locations[locations[\"Addresses\"] != \"\"].copy()\n",
"locations"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 48,
"outputs": [],
"source": [
"def extract_countries(text):\n",
"    \"\"\"Return the countries geograpy detects in ``text``, or None if none can be extracted.\n",
"\n",
"    Args:\n",
"        text: address text as str, or as UTF-8 encoded bytes (decoded before use).\n",
"\n",
"    Returns:\n",
"        The ``countries`` attribute of geograpy's place context, or None when the\n",
"        input is missing/blank or geograpy raises while processing it.\n",
"    \"\"\"\n",
"    if isinstance(text, bytes):\n",
"        text = text.decode(\"utf-8\", errors=\"replace\")\n",
"    # Missing values (None, NaN) and blank strings carry no place names.\n",
"    if not isinstance(text, str) or not text.strip():\n",
"        return None\n",
"    try:\n",
"        return geograpy.get_place_context(text=text).countries\n",
"    except Exception:\n",
"        # Best effort: one unparsable address must not abort a whole-column apply.\n",
"        # Exception (not a bare except) keeps KeyboardInterrupt able to stop a long run.\n",
"        return None"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[50], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m text\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mStoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m----> 2\u001B[0m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtext\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtext\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241m.\u001B[39mcountries\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m<listcomp>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other.<locals>.unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m<genexpr>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n",
"\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>"
]
}
],
"source": [
"# Smoke test: ask geograpy for the countries in one hand-written address string.\n",
"# NOTE(review): the traceback saved with this cell fails inside geograpy while it reads its\n",
"# bundled ISO3166ErrorDictionary.csv with the cp1250 codec, so the input text does not look\n",
"# like the cause - presumably a platform default-encoding issue, TODO confirm.\n",
"text = \"Stoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\"\n",
"geograpy.get_place_context(\n",
"    text=text\n",
").countries"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Addresses \n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ... \\\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ... \n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi... \n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ... \n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi... \n... ... ... \n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I... \n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U... \n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji... \n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U... \n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ... \n\n Country \n1 None \n2 None \n3 None \n4 None \n5 None \n... ... \n74669 None \n74670 None \n74671 None \n74672 None \n74673 None \n\n[64339 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Addresses</th>\n <th>Country</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>b'Smith, Vincent] Nat Hist Museum, London SW7 ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209536100003</td>\n <td>b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>74669</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74670</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74671</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74672</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74673</th>\n <td>WOS:000947693400001</td>\n <td>b'Jiang, Linhua] Univ Politecn Valencia, Europ...</td>\n <td>None</td>\n </tr>\n </tbody>\n</table>\n<p>64339 rows × 3 columns</p>\n</div>"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hand every address to extract_countries; calling it with no argument raises TypeError.\n",
"locations[\"Country\"]=locations['Addresses'].apply(extract_countries)\n",
"locations"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"ename": "NameError",
"evalue": "name 'nltk' is not defined",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[26], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m geograpy\u001B[38;5;241m-\u001B[39m\u001B[43mnltk\u001B[49m\n",
"\u001B[1;31mNameError\u001B[0m: name 'nltk' is not defined"
]
}
],
"source": [
"import geograpy\n",
"# geograpy-nltk  <- kept only as a note: as code it parses as `geograpy - nltk` and raises\n",
"# NameError because no name `nltk` is defined here. Presumably it is a package name meant\n",
"# for an install command - TODO confirm."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading C:\\Users\\radvanyi/.geograpy3/locations.db.gz from https://raw.githubusercontent.com/wiki/somnathrakshit/geograpy3/data/locations.db.gz ... this might take a few seconds\n",
"Unzipping C:\\Users\\radvanyi/.geograpy3/locations.db from C:\\Users\\radvanyi/.geograpy3/locations.db.gz\n",
"Extracting completed\n"
]
},
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[27], line 3\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[0;32m 2\u001B[0m url \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mhttps://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[1;32m----> 3\u001B[0m places \u001B[38;5;241m=\u001B[39m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_geoPlace_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43murl\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:24\u001B[0m, in \u001B[0;36mget_geoPlace_context\u001B[1;34m(url, text, debug)\u001B[0m\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mget_geoPlace_context\u001B[39m(url\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m, text\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,debug\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[0;32m 10\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[0;32m 11\u001B[0m \u001B[38;5;124;03m Get a place context for a given text with information\u001B[39;00m\n\u001B[0;32m 12\u001B[0m \u001B[38;5;124;03m about country, region, city and other\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;124;03m PlaceContext: the place context\u001B[39;00m\n\u001B[0;32m 23\u001B[0m \u001B[38;5;124;03m '''\u001B[39;00m \n\u001B[1;32m---> 24\u001B[0m places\u001B[38;5;241m=\u001B[39m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtext\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlabels\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mLabels\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgeo\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdebug\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdebug\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 25\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m places\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m<listcomp>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other.<locals>.unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m<genexpr>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n",
"\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>"
]
}
],
"source": [
"# Smoke test against a web page: geograpy is given a URL instead of a text string.\n",
"import geograpy\n",
"\n",
"url = 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay'\n",
"places = geograpy.get_geoPlace_context(\n",
"    url=url\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package maxent_ne_chunker to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package maxent_ne_chunker is already up-to-date!\n",
"[nltk_data] Downloading package words to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package words is already up-to-date!\n"
]
},
{
"data": {
"text/plain": "True"
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import geograpy\n",
"import nltk\n",
"\n",
"# Fetch the NLTK resources one after another; the final call stays a bare expression\n",
"# so that its return value remains the cell's displayed result.\n",
"for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker'):\n",
"    nltk.download(resource)\n",
"nltk.download('words')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"United Kingdom\n",
"United States\n"
]
}
],
"source": [
"# import pycountry\n",
"# text = \"United States (New York), United Kingdom (London)\"\n",
"# for country in pycountry.countries:\n",
"# if country.name in text:\n",
"# print(country.name)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}