{ "cells": [ { "cell_type": "code", "execution_count": 1, "outputs": [], "source": [ "import pandas as pd\n", "# Importing libraries and module and some setting for notebook\n", "\n", "import pandas as pd\n", "import re\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import numpy as np\n", "from scipy.sparse import csr_matrix\n", "import sparse_dot_topn.sparse_dot_topn as ct #Cosine Similarity\n", "import time\n", "from tqdm import tqdm" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "def wikinorm(univ_string):\n", " from googlesearch import search\n", " from nltk.metrics import edit_distance\n", " from operator import itemgetter\n", " from numpy.random import default_rng\n", " rng = default_rng()\n", " results = search(univ_string, lang=\"en\", num_results=3,advanced=True, sleep_interval=rng.uniform(1, 5))\n", " univ_name = univ_string.split(\",\")[0]\n", " u_results = [i.title for i in results if \"Category:\" not in i.title]\n", " return sorted([tuple((j,edit_distance(univ_name, j))) for j in u_results],key=itemgetter(1))[0][0]\n" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "outputs": [], "source": [ "def replace_uppercase_words(text):\n", " words = text.split()\n", " all_uppercase = all(word.isupper() for word in words)\n", " all_lowercase = all(word.islower() for word in words)\n", " if all_uppercase or all_lowercase:\n", " return text\n", " else:\n", " result = []\n", " for word in words:\n", " w = word.strip()\n", " if not w.isupper() and not w.islower():\n", " result.append(w)\n", " return \" \".join(result).strip()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO: Pandarallel will run on 4 workers.\n", "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n", "\n", "WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n", "https://nalepae.github.io/pandarallel/troubleshooting/\n" ] }, { "data": { "text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…", "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "5f8bead5565146a5843c01b81b77cf9f" } }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "178638\n" ] } ], "source": [ "outdir=\"wos_processed_data\"\n", "univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n", "\n", "from pandarallel import pandarallel\n", "pandarallel.initialize(progress_bar=True, nb_workers=4)\n", "\n", "univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n", "print(len(univ))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Institution \n149037 WOS:000764953300001 Univ Elect Sci & Technol China \\\n86834 WOS:000519526500027 Radboud Univ Nijmegen \n143915 WOS:000739917304088 Swiss Fed Inst Technol \n135117 WOS:000707680800001 North China Elect Power Univ Beijing \n110390 WOS:000605608700001 Imperial Coll London \n... ... ... \n21250 WOS:000358912300001 Jilin Univ \n23018 WOS:000364230600002 Tampere Univ Technol \n126847 WOS:000675855300001 Univ Copenhagen \n15313 WOS:000343701400001 Univ Siena \n77834 WOS:000490147400012 Tsinghua Univ \n\n Country Institution_harm \n149037 China Univ Elect Sci & Technol China \n86834 Netherlands Radboud Univ Nijmegen \n143915 Switzerland Swiss Fed Inst Technol \n135117 China North China Elect Power Univ Beijing \n110390 United Kingdom Imperial Coll London \n... ... ... \n21250 China Jilin Univ \n23018 Finland Tampere Univ Technol \n126847 Denmark Univ Copenhagen \n15313 Italy Univ Siena \n77834 China Tsinghua Univ \n\n[100 rows x 4 columns]", "text/html": "
\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \n
---|---|---|---|---|
149037 | \nWOS:000764953300001 | \nUniv Elect Sci & Technol China | \nChina | \nUniv Elect Sci & Technol China | \n
86834 | \nWOS:000519526500027 | \nRadboud Univ Nijmegen | \nNetherlands | \nRadboud Univ Nijmegen | \n
143915 | \nWOS:000739917304088 | \nSwiss Fed Inst Technol | \nSwitzerland | \nSwiss Fed Inst Technol | \n
135117 | \nWOS:000707680800001 | \nNorth China Elect Power Univ Beijing | \nChina | \nNorth China Elect Power Univ Beijing | \n
110390 | \nWOS:000605608700001 | \nImperial Coll London | \nUnited Kingdom | \nImperial Coll London | \n
... | \n... | \n... | \n... | \n... | \n
21250 | \nWOS:000358912300001 | \nJilin Univ | \nChina | \nJilin Univ | \n
23018 | \nWOS:000364230600002 | \nTampere Univ Technol | \nFinland | \nTampere Univ Technol | \n
126847 | \nWOS:000675855300001 | \nUniv Copenhagen | \nDenmark | \nUniv Copenhagen | \n
15313 | \nWOS:000343701400001 | \nUniv Siena | \nItaly | \nUniv Siena | \n
77834 | \nWOS:000490147400012 | \nTsinghua Univ | \nChina | \nTsinghua Univ | \n
100 rows × 4 columns
\n\n | Country | \nInstitution_harm | \ncount | \n
---|---|---|---|
496 | \nBelgium | \nHaute Ecole Louvain Hainaut | \n1 | \n
10566 | \nGermany | \nIQM | \n4 | \n
6670 | \nChina | \nTiantan Hosp | \n1 | \n
16974 | \nSwitzerland | \nSpecies Survival Commiss Mushroom Bracket & | \n1 | \n
9200 | \nFrance | \nHop Hotel Dieu Paris | \n2 | \n
... | \n... | \n... | \n... | \n
11326 | \nGermany | \nInt Max Planck Res Sch Earth Syst Modeling | \n1 | \n
2874 | \nChina | \nChina Natl Nucl Corp 416 Hosp | \n2 | \n
17967 | \nUnited Kingdom | \nAccelerComm Ltd | \n1 | \n
14041 | \nNetherlands | \nVankeulen Advies | \n1 | \n
4148 | \nChina | \nFirst Peoples Hosp Jingmen Affiliated Hubei Minzu | \n1 | \n
100 rows × 3 columns
\n\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \nmerge_iter | \n
---|---|---|---|---|---|
124019 | \nWOS:000663304500011 | \nExcellium Consulting | \nUnited Kingdom | \nExcellium Consulting | \n0 | \n
126511 | \nWOS:000674472400006 | \nTongji Univ | \nChina | \nTongji Univ | \n0 | \n
33359 | \nWOS:000391252900006 | \nBeihang Univ | \nChina | \nBeihang Univ | \n0 | \n
153729 | \nWOS:000787596500003 | \nFudan Univ | \nChina | \nFudan Univ | \n0 | \n
69317 | \nWOS:000467564700105 | \nGuangdong Univ Technol | \nChina | \nGuangdong Univ Technol | \n0 | \n
... | \n... | \n... | \n... | \n... | \n... | \n
160384 | \nWOS:000812531900012 | \nZhengzhou Univ | \nChina | \nZhengzhou Univ | \n0 | \n
133501 | \nWOS:000702637000007 | \nUniv Luxembourg | \nLuxembourg | \nUniv Luxembourg | \n0 | \n
140349 | \nWOS:000728149000027 | \nRegina Montis Regalis Hosp | \nItaly | \nRegina Montis Regalis Hosp | \n0 | \n
160632 | \nWOS:000813959600003 | \nLiyang Peoples Hosp | \nChina | \nLiyang Peoples Hosp | \n0 | \n
174735 | \nWOS:000888555700002 | \nSwiss Fed Inst Technol | \nSwitzerland | \nSwiss Fed Inst Technol | \n0 | \n
500 rows × 5 columns
\n