You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_concat.ipynb

344 lines
34 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"\n",
"import geograpy\n",
"import pandas as pd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wos_extract\\wosexport1.xls\n",
"wos_extract\\wosexport10.xls\n",
"wos_extract\\wosexport11.xls\n",
"wos_extract\\wosexport2.xls\n",
"wos_extract\\wosexport3.xls\n",
"wos_extract\\wosexport4.xls\n",
"wos_extract\\wosexport5.xls\n",
"wos_extract\\wosexport6.xls\n",
"wos_extract\\wosexport7.xls\n",
"wos_extract\\wosexport8.xls\n",
"wos_extract\\wosexport9.xls\n"
]
}
],
"source": [
"# Merge every `wosexport*` Excel file found under `workdir_path` into one tab-separated file.\n",
"workdir_path = r\"wos_extract\"\n",
"outfile = 'wos_extract_complete.csv'\n",
"# True only until the first export has been written: that write emits the header row\n",
"# and truncates whatever an earlier run left in `outfile`.\n",
"with_header = True\n",
"for root, dirs, files in os.walk(workdir_path):\n",
"    # os.walk gives no ordering guarantee; sorting keeps the row order reproducible.\n",
"    dirs.sort()\n",
"    for filename in sorted(files):\n",
"        if filename.startswith(\"wosexport\"):\n",
"            path = os.path.join(root, filename)\n",
"            print(path)\n",
"            chunk = pd.read_excel(path)\n",
"            # Appending unconditionally would stack a second header and a duplicate copy\n",
"            # of every export onto the old file each time this cell is re-run.\n",
"            chunk.to_csv(\n",
"                outfile,\n",
"                mode=\"w\" if with_header else \"a\",\n",
"                index=False,\n",
"                header=with_header,\n",
"                sep=\"\\t\",\n",
"                encoding=\"utf-8\",\n",
"            )\n",
"            with_header = False"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [],
"source": [
"# The combined file is produced by DataFrame.to_csv, which writes UTF-8 unless told otherwise;\n",
"# decoding it as ISO8859-1 would garble every non-ASCII character in names and addresses.\n",
"wos = pd.read_csv(outfile, sep=\"\\t\", encoding=\"utf-8\", low_memory=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# wos\n",
"record_col=\"UT (Unique WOS ID)\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [],
"source": [
"# wos[[\"Addresses\",\"Affiliations\",record_col]]\n",
"import unidecode"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Addresses\n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ...\n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...\n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...\n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...\n... ... ...\n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...\n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...\n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...\n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...\n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ...\n\n[64339 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Addresses</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>b'Smith, Vincent] Nat Hist Museum, London SW7 ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209536100003</td>\n <td>b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>74669</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...</td>\n </tr>\n <tr>\n <th>74670</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...</td>\n </tr>\n <tr>\n <th>74671</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...</td>\n </tr>\n <tr>\n <th>74672</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...</td>\n </tr>\n <tr>\n <th>74673</th>\n <td>WOS:000947693400001</td>\n <td>b'Jiang, Linhua] Univ Politecn Valencia, Europ...</td>\n </tr>\n </tbody>\n</table>\n<p>64339 rows × 2 columns</p>\n</div>"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split each record's \"Addresses\" string on \"[\" so every bracketed author group and its\n",
"# affiliation becomes a separate row, still keyed by the record id.\n",
"locations = (\n",
"    wos.groupby(record_col)[\"Addresses\"]\n",
"    .apply(lambda x: x.str.split('['))\n",
"    .explode()\n",
"    .reset_index()\n",
"    .drop(columns=\"level_1\")\n",
")\n",
"# Drop missing addresses and the empty piece produced in front of a leading \"[\".\n",
"# Values stay as str: encoding them to bytes breaks text-based processing downstream.\n",
"has_address = locations[\"Addresses\"].notna() & (locations[\"Addresses\"] != \"\")\n",
"locations = locations[has_address].copy()\n",
"locations"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 48,
"outputs": [],
"source": [
"def extract_countries(text):\n",
"    \"\"\"Return the countries geograpy detects in ``text``.\n",
"\n",
"    Args:\n",
"        text: Address text to analyse. ``bytes`` are decoded as UTF-8 first;\n",
"            missing or blank values are not passed to geograpy.\n",
"\n",
"    Returns:\n",
"        The ``countries`` attribute of geograpy's place context, or ``None``\n",
"        when there is no usable text or geograpy raises.\n",
"    \"\"\"\n",
"    if isinstance(text, bytes):\n",
"        text = text.decode(\"utf-8\", errors=\"replace\")\n",
"    if not isinstance(text, str) or not text.strip():\n",
"        return None\n",
"    try:\n",
"        # Pass the function's own argument through to geograpy.\n",
"        return geograpy.get_place_context(text=text).countries\n",
"    except Exception:\n",
"        # Deliberate best effort: one failing row must not abort a whole-column apply.\n",
"        # Catching Exception rather than everything still lets Ctrl-C stop a long run.\n",
"        return None"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[50], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m text\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mStoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m----> 2\u001B[0m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtext\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtext\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241m.\u001B[39mcountries\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m<listcomp>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other.<locals>.unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m<genexpr>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n",
"\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>"
]
}
],
"source": [
"# Smoke test: run geograpy on a single hand-written affiliation string.\n",
"text = \"Stoev, Pavel; Penev, Lyubomir] Pensoft Publishers, Sofia, Bulgaria;\"\n",
"place_context = geograpy.get_place_context(text=text)\n",
"place_context.countries"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Addresses \n1 WOS:000209536100003 b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ... \\\n2 WOS:000209536100003 b'Smith, Vincent] Nat Hist Museum, London SW7 ... \n3 WOS:000209536100003 b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi... \n4 WOS:000209536100003 b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ... \n5 WOS:000209536100003 b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi... \n... ... ... \n74669 WOS:000947693400001 b'Wang, Shihang] ShanghaiTech Univ, Shanghai I... \n74670 WOS:000947693400001 b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U... \n74671 WOS:000947693400001 b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji... \n74672 WOS:000947693400001 b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U... \n74673 WOS:000947693400001 b'Jiang, Linhua] Univ Politecn Valencia, Europ... \n\n Country \n1 None \n2 None \n3 None \n4 None \n5 None \n... ... \n74669 None \n74670 None \n74671 None \n74672 None \n74673 None \n\n[64339 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Addresses</th>\n <th>Country</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000209536100003</td>\n <td>b'Edmunds, Scott C.; Hunter, Chris I.] BGI HK ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209536100003</td>\n <td>b'Smith, Vincent] Nat Hist Museum, London SW7 ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel; Penev, Lyubomir] Pensoft Publi...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209536100003</td>\n <td>b'Stoev, Pavel] Nat Hist Museum, Natl Museum, ...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209536100003</td>\n <td>b'Penev, Lyubomir] Bulgarian Acad Sci, Inst Bi...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>74669</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang] ShanghaiTech Univ, Shanghai I...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74670</th>\n <td>WOS:000947693400001</td>\n <td>b'Wang, Shihang; Peng, Sihua] Shanghai Ocean U...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74671</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan] Shanghai Jiao Tong Univ, Ruiji...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74672</th>\n <td>WOS:000947693400001</td>\n <td>b'Shen, Zhehan; Liu, Taigang] Shanghai Ocean U...</td>\n <td>None</td>\n </tr>\n <tr>\n <th>74673</th>\n <td>WOS:000947693400001</td>\n <td>b'Jiang, Linhua] Univ Politecn Valencia, Europ...</td>\n <td>None</td>\n </tr>\n </tbody>\n</table>\n<p>64339 rows × 3 columns</p>\n</div>"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One lookup per row: each address value is handed to extract_countries as its argument.\n",
"locations[\"Country\"] = locations[\"Addresses\"].apply(extract_countries)\n",
"locations"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"ename": "NameError",
"evalue": "name 'nltk' is not defined",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[26], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m geograpy\u001B[38;5;241m-\u001B[39m\u001B[43mnltk\u001B[49m\n",
"\u001B[1;31mNameError\u001B[0m: name 'nltk' is not defined"
]
}
],
"source": [
"# NOTE: a bare `geograpy-nltk` line used to follow this import. Python reads it as the\n",
"# subtraction `geograpy - nltk`, which raised NameError. It looks like a package name\n",
"# (presumably meant for pip) rather than code, so it survives only as this note.\n",
"import geograpy"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading C:\\Users\\radvanyi/.geograpy3/locations.db.gz from https://raw.githubusercontent.com/wiki/somnathrakshit/geograpy3/data/locations.db.gz ... this might take a few seconds\n",
"Unzipping C:\\Users\\radvanyi/.geograpy3/locations.db from C:\\Users\\radvanyi/.geograpy3/locations.db.gz\n",
"Extracting completed\n"
]
},
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mUnicodeDecodeError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[27], line 3\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgeograpy\u001B[39;00m\n\u001B[0;32m 2\u001B[0m url \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mhttps://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[1;32m----> 3\u001B[0m places \u001B[38;5;241m=\u001B[39m \u001B[43mgeograpy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_geoPlace_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43murl\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:24\u001B[0m, in \u001B[0;36mget_geoPlace_context\u001B[1;34m(url, text, debug)\u001B[0m\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mget_geoPlace_context\u001B[39m(url\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m, text\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,debug\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[0;32m 10\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[0;32m 11\u001B[0m \u001B[38;5;124;03m Get a place context for a given text with information\u001B[39;00m\n\u001B[0;32m 12\u001B[0m \u001B[38;5;124;03m about country, region, city and other\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;124;03m PlaceContext: the place context\u001B[39;00m\n\u001B[0;32m 23\u001B[0m \u001B[38;5;124;03m '''\u001B[39;00m \n\u001B[1;32m---> 24\u001B[0m places\u001B[38;5;241m=\u001B[39m\u001B[43mget_place_context\u001B[49m\u001B[43m(\u001B[49m\u001B[43murl\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtext\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlabels\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mLabels\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgeo\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdebug\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdebug\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 25\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m places\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\__init__.py:46\u001B[0m, in \u001B[0;36mget_place_context\u001B[1;34m(url, text, labels, debug)\u001B[0m\n\u001B[0;32m 44\u001B[0m e\u001B[38;5;241m.\u001B[39mfind_entities(labels\u001B[38;5;241m=\u001B[39mlabels)\n\u001B[0;32m 45\u001B[0m places\u001B[38;5;241m=\u001B[39me\u001B[38;5;241m.\u001B[39mplaces\n\u001B[1;32m---> 46\u001B[0m pc \u001B[38;5;241m=\u001B[39m \u001B[43mPlaceContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 47\u001B[0m pc\u001B[38;5;241m.\u001B[39msetAll()\n\u001B[0;32m 48\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m pc\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:32\u001B[0m, in \u001B[0;36mPlaceContext.__init__\u001B[1;34m(self, place_names, setAll, correctMisspelling)\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnormalizePlaces(place_names)\n\u001B[0;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m setAll:\n\u001B[1;32m---> 32\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msetAll\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:90\u001B[0m, in \u001B[0;36mPlaceContext.setAll\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 88\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_regions()\n\u001B[0;32m 89\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_cities()\n\u001B[1;32m---> 90\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mset_other\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36mPlaceContext.set_other\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m unused(p)]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:206\u001B[0m, in \u001B[0;36m<listcomp>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[0;32m 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcorrect_country_misspelling(place_name) \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n\u001B[1;32m--> 206\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mother \u001B[38;5;241m=\u001B[39m [p \u001B[38;5;28;01mfor\u001B[39;00m p \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplaces \u001B[38;5;28;01mif\u001B[39;00m \u001B[43munused\u001B[49m\u001B[43m(\u001B[49m\u001B[43mp\u001B[49m\u001B[43m)\u001B[49m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36mPlaceContext.set_other.<locals>.unused\u001B[1;34m(place_name)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mall\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43ml\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mplaces\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\places.py:204\u001B[0m, in \u001B[0;36m<genexpr>\u001B[1;34m(.0)\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21munused\u001B[39m(place_name):\n\u001B[0;32m 203\u001B[0m places \u001B[38;5;241m=\u001B[39m [\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcountries, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcities, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mregions]\n\u001B[1;32m--> 204\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mall\u001B[39m(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcorrect_country_misspelling\u001B[49m\u001B[43m(\u001B[49m\u001B[43mplace_name\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m l \u001B[38;5;28;01mfor\u001B[39;00m l \u001B[38;5;129;01min\u001B[39;00m places)\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\geograpy\\locator.py:1105\u001B[0m, in \u001B[0;36mLocator.correct_country_misspelling\u001B[1;34m(self, name)\u001B[0m\n\u001B[0;32m 1103\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(cur_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/data/ISO3166ErrorDictionary.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[0;32m 1104\u001B[0m reader \u001B[38;5;241m=\u001B[39m csv\u001B[38;5;241m.\u001B[39mreader(info)\n\u001B[1;32m-> 1105\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m row \u001B[38;5;129;01min\u001B[39;00m reader:\n\u001B[0;32m 1106\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m remove_non_ascii(row[\u001B[38;5;241m0\u001B[39m]):\n\u001B[0;32m 1107\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m row[\u001B[38;5;241m2\u001B[39m]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\encodings\\cp1250.py:23\u001B[0m, in \u001B[0;36mIncrementalDecoder.decode\u001B[1;34m(self, input, final)\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecode\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m, final\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m---> 23\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mcodecs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcharmap_decode\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\u001B[43mdecoding_table\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n",
"\u001B[1;31mUnicodeDecodeError\u001B[0m: 'charmap' codec can't decode byte 0x83 in position 49: character maps to <undefined>"
]
}
],
"source": [
"import geograpy\n",
"\n",
"# Exercise the URL-based entry point on a public Wikipedia article (needs network access).\n",
"url = 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay'\n",
"places = geograpy.get_geoPlace_context(url=url)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package maxent_ne_chunker to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package maxent_ne_chunker is already up-to-date!\n",
"[nltk_data] Downloading package words to\n",
"[nltk_data] C:\\Users\\radvanyi\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package words is already up-to-date!\n"
]
},
{
"data": {
"text/plain": "True"
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import geograpy\n",
"import nltk\n",
"\n",
"# Download the NLTK resources one after another (presumably the ones geograpy's entity\n",
"# extraction relies on - confirm). The cell displays the status of the last download.\n",
"[\n",
"    nltk.download(resource)\n",
"    for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words')\n",
"][-1]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"United Kingdom\n",
"United States\n"
]
}
],
"source": [
"# import pycountry\n",
"# text = \"United States (New York), United Kingdom (London)\"\n",
"# for country in pycountry.countries:\n",
"# if country.name in text:\n",
"# print(country.name)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}