def wikinorm(univ_string):
    """Return the web-search result title closest to an institution name.

    Runs a ``googlesearch`` query for ``univ_string`` (``lang="en"``,
    ``num_results=3``, ``advanced=True``), discards every result whose title
    contains ``"Category:"`` and returns the remaining title with the
    smallest edit distance to the part of ``univ_string`` that precedes its
    first comma. On equal distances the earliest search result wins, because
    the sort is stable.

    NOTE(review): the function name and the ``"Category:"`` filter suggest
    that Wikipedia page titles are expected, but the query itself is not
    restricted to Wikipedia -- confirm this is intended.

    Parameters
    ----------
    univ_string : str
        Search query. Text after the first comma is part of the query but is
        ignored when measuring the edit distance.

    Returns
    -------
    str
        Title of the best-matching search result.

    Raises
    ------
    IndexError
        If no search result is left after the ``"Category:"`` filter.
    """
    # Imports stay local so the notebook's other cells do not need these packages.
    from googlesearch import search
    from nltk.metrics import edit_distance
    from operator import itemgetter
    from numpy.random import default_rng

    rng = default_rng()
    results = search(
        univ_string,
        lang="en",
        num_results=3,
        advanced=True,
        # Random value in [1, 5); presumably seconds of throttling between
        # requests -- confirm against the googlesearch documentation.
        sleep_interval=rng.uniform(1, 5),
    )
    univ_name = univ_string.split(",")[0]
    titles = [hit.title for hit in results if "Category:" not in hit.title]
    ranked = sorted(
        ((title, edit_distance(univ_name, title)) for title in titles),
        key=itemgetter(1),
    )
    return ranked[0][0]


def replace_uppercase_words(text):
    """Drop fully-uppercase and fully-lowercase words from a mixed-case name.

    The string is split on whitespace. If every word is uppercase, or every
    word is lowercase, the text is returned unchanged. Otherwise only the
    words that are neither fully uppercase nor fully lowercase are kept
    (this includes tokens without cased characters such as ``"&"`` or
    ``"1"``) and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw institution name. Non-string values (e.g. ``None`` or the NaN
        that pandas uses for empty cells) are returned unchanged instead of
        raising ``AttributeError``.

    Returns
    -------
    str
        The filtered name, or ``text`` itself when it is uniformly cased,
        empty, or when filtering would remove every word.
    """
    # Missing cells arrive as float NaN / None; there is nothing to normalise.
    if not isinstance(text, str):
        return text

    words = text.split()
    # all() is True for an empty list, so "" and whitespace-only input land here.
    if all(w.isupper() for w in words) or all(w.islower() for w in words):
        return text

    kept = [w for w in words if not w.isupper() and not w.islower()]
    # A name such as "CNRS lab" has mixed casing overall but no mixed-case
    # word; keep the original rather than collapsing the name to "".
    if not kept:
        return text
    return " ".join(kept)
# Folder that holds the processed Web of Science exports.
outdir = "wos_processed_data"

# One row per (publication, institution) pair; the displayed sample shows the
# columns "UT (Unique WOS ID)", "Institution" and "Country".
univ = pd.read_excel(f"{outdir}/wos_institution_locations.xlsx")

from pandarallel import pandarallel

# Initialise pandarallel (4 workers, progress bar) before parallel_apply is used.
pandarallel.initialize(nb_workers=4, progress_bar=True)

# Add the harmonised institution name as a new column, computed in parallel.
univ = univ.assign(
    Institution_harm=univ["Institution"].parallel_apply(replace_uppercase_words)
)
print(univ.shape[0])
\n29206 Germany Univ Duisburg Essen \n21658 United Kingdom Univ Southampton \n43289 United Kingdom Univ Strathclyde \n37200 Germany Goethe Univ Frankfurt \n95964 Netherlands Eindhoven Univ Technol \n\n[100 rows x 4 columns]", "text/html": "
\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \n
---|---|---|---|---|
1094 | \nWOS:000292330300050 | \nHong Kong Polytech Univ | \nChina | \nHong Kong Polytech Univ | \n
21547 | \nWOS:000374363900001 | \nGuangdong Univ Technol | \nChina | \nGuangdong Univ Technol | \n
53778 | \nWOS:000459846300019 | \nAarhus Univ | \nDenmark | \nAarhus Univ | \n
153776 | \nWOS:000907044000014 | \nUniv Siena | \nItaly | \nUniv Siena | \n
81562 | \nWOS:000554591602038 | \nChina Natl Elect Import Export Corp | \nChina | \nChina Natl Elect Import Export Corp | \n
... | \n... | \n... | \n... | \n... | \n
29206 | \nWOS:000397047200002 | \nUniv Duisburg Essen | \nGermany | \nUniv Duisburg Essen | \n
21658 | \nWOS:000374617600020 | \nUniv Southampton | \nUnited Kingdom | \nUniv Southampton | \n
43289 | \nWOS:000434742800004 | \nUniv Strathclyde | \nUnited Kingdom | \nUniv Strathclyde | \n
37200 | \nWOS:000418525100013 | \nGoethe Univ Frankfurt | \nGermany | \nGoethe Univ Frankfurt | \n
95964 | \nWOS:000616310200013 | \nEindhoven Univ Technol | \nNetherlands | \nEindhoven Univ Technol | \n
100 rows × 4 columns
\n\n | Country | \nInstitution_harm | \ncount | \n
---|---|---|---|
12655 | \nPoland | \nSpace Res Ctr | \n6 | \n
12940 | \nPortugal | \nCtr Invest Energia State Grid | \n1 | \n
616 | \nChina | \nMinist Nat Resources | \n78 | \n
5561 | \nChina | \nPowerChina Huadong Engn Corp Ltd | \n1 | \n
514 | \nChina | \nChongqing Univ | \n478 | \n
... | \n... | \n... | \n... | \n
476 | \nBulgaria | \nTech Univ | \n1 | \n
12454 | \nNorway | \nStavanger Univ Hosp | \n9 | \n
5489 | \nChina | \nShanghai Sports Sch | \n1 | \n
768 | \nChina | \nHubei Univ | \n25 | \n
13527 | \nSpain | \nJimenez Diaz Univ Hosp | \n2 | \n
100 rows × 3 columns
\n\n | Country | \nlow_side | \nhigh_side | \nhigh_count | \nlow_count | \nsimilarity | \n
---|---|---|---|---|---|---|
0 | \nChina | \nLogist Univ Chinese Peoples Armed Police Forces | \nLogist Univ Chinese Peoples Armed Police Force | \n1 | \n1 | \n0.988072 | \n
9 | \nChina | \nFlight Automat Control Res Inst | \nXian Flight Automat Control Res Inst | \n1 | \n1 | \n0.905747 | \n
10 | \nChina | \nNorthwest Elect Power Design Inst Co Ltd | \nNorthwest Elect Power Design Inst Co Ltd China | \n1 | \n1 | \n0.926984 | \n
11 | \nChina | \nNorthwest Elect Power Design Inst Co Ltd China | \nNorthwest Elect Power Design Inst Co Ltd | \n1 | \n1 | \n0.926984 | \n
12 | \nChina | \nNorthwest Inst Ecoenvironm & Resources | \nNorthwest Inst Ecoenvironm & Resources Chinese Ac | \n1 | \n1 | \n0.910630 | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n
1531 | \nChina | \nChinese Univ Hong Kong Hong | \nChinese Univ Hong Kong | \n728 | \n1 | \n0.935944 | \n
1532 | \nChina | \nHuazhong Univ Sci & Techno | \nHuazhong Univ Sci & Technol | \n729 | \n1 | \n0.989260 | \n
1533 | \nChina | \nHong Kong Polytech Univ Hong Kong | \nHong Kong Polytech Univ | \n809 | \n1 | \n0.917345 | \n
1534 | \nChina | \nKong Kong Polytech Univ | \nHong Kong Polytech Univ | \n809 | \n1 | \n0.939416 | \n
1537 | \nChina | \nUniv Elect Sci & Technol Chin | \nUniv Elect Sci & Technol China | \n1076 | \n1 | \n0.983258 | \n
346 rows × 6 columns
\n\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \nmerge_iter | \n
---|---|---|---|---|---|
244 | \nWOS:000286472300003 | \nUniv Trent | \nItaly | \nUniv Trento | \n1 | \n
364 | \nWOS:000287586100011 | \nUniv Trent | \nItaly | \nUniv Trento | \n1 | \n
410 | \nWOS:000287939200011 | \nAbdus Salam Int Ctr Theoret Phys | \nItaly | \nAbdus Salaam Int Ctr Theoret Phys | \n1 | \n
765 | \nWOS:000290996200002 | \nUniv Trent | \nItaly | \nUniv Trento | \n1 | \n
907 | \nWOS:000291698400013 | \nINFN Sez Roma 1 | \nItaly | \nSez Roma | \n1 | \n
... | \n... | \n... | \n... | \n... | \n... | \n
153063 | \nWOS:000900129900175 | \nUniv Rome Campus Biomed Aquila | \nItaly | \nUniv Rome Campus Biomed Aquila | \n2 | \n
154775 | \nWOS:000929737300001 | \nPrevent & Res Inst | \nItaly | \nPrevent & Res Inst | \n2 | \n
154813 | \nWOS:000929737300001 | \nIst Super Sanit | \nItaly | \nIst Super Sanita | \n1 | \n
154855 | \nWOS:000933331200004 | \nUniv Federio II | \nItaly | \nUniv Federico | \n1 | \n
154857 | \nWOS:000933331200004 | \nINAF Osservatorio Astron Capodimonte | \nItaly | \nOsserv Astron Capodimonte | \n1 | \n
375 rows × 5 columns
\n