{ "cells": [ { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "import shutil\n", "from flashgeotext.geotext import GeoText\n", "import re\n", "# import spacy\n", "#\n", "# nlp = spacy.load(\"en_core_web_lg\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "record_col=\"UT (Unique WOS ID)\"\n", "outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\"" ] }, { "cell_type": "code", "execution_count": 8, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of initial records: 27672\n", "Number of filtered records: 24653\n" ] } ], "source": [ "wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n", "print(f'Number of initial records: {len(wos)}')\n", "metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n", "\n", "\n", "metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n", "metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n", "metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "\n", "wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n", "wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n", "\n", "wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n", "wos = wos_merge.sort_values(by=\"issn_var\",ascending=False).drop_duplicates(subset=record_col)\n", "\n", "# drop entries not indexed by metrix\n", "wos = wos[~wos[\"Domain_English\"].isna()]\n", "# drop duplicates (based on doi)\n", "wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n", "wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n", "print(f'Number of filtered records: {len(wos)}')" ] }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "data": { "text/plain": " Article Title \n23070 Stochastic bias of colour-selected BAO tracers... \\\n30139 A novel integrative approach elucidates fine-s... \n4538 Optimal Number of Clusters by Measuring Simila... \n34242 Analyzing the Noise Robustness of Deep Neural ... \n26727 Learning to Prompt for Open-Vocabulary Object ... \n... ... \n3290 Research on Reverse Skyline Query Algorithm Ba... \n45159 Using Recurrent Neural Network for Intelligent... \n21653 Output-Bounded and RBFNN-Based Position Tracki... \n43983 A Novel 3D Intelligent Cluster Method for Mali... \n11880 BlockHammer: Improving Flash Reliability by Ex... \n\n Keywords Plus \n23070 DIGITAL SKY SURVEY; BARYON ACOUSTIC-OSCILLATIO... \\\n30139 CHAOTIC GENETIC PATCHINESS; PELAGIC LARVAL DUR... \n4538 VALIDATION; ALGORITHM; TUTORIAL \n34242 VISUAL ANALYTICS \n26727 NaN \n... ... \n3290 MAPREDUCE \n45159 NaN \n21653 IMPEDANCE CONTROL; ROBOT \n43983 NETWORK INTRUSION DETECTION; DDOS DETECTION; A... \n11880 MEMORY; PERFORMANCE; RETENTION; ENDURANCE; OPT... \n\n Author Keywords \n23070 galaxies: evolution; galaxies: haloes; galaxie... 
\n30139 NaN \n4538 Event-related potentials; Optimal number of cl... \n34242 Neurons; Visualization; Data visualization; Fe... \n26727 NaN \n... ... \n3290 Big Data; Database Management; Database Query;... \n45159 water resources; intelligent prediction; water... \n21653 Security tele-surgery; RBFNN; bilateral positi... \n43983 Auto encoder; DDos detection; Attack classific... \n11880 Reliability; Three-dimensional displays; Error... \n\n[100 rows x 3 columns]", "text/html": "
" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600266 ANME\n1 WOS:000208863600266 PYROSEQUENCING\n2 WOS:000208863600266 AOM\n3 WOS:000208863600266 COMMUNITY STRUCTURE\n4 WOS:000208863600266 NYEGGA\n.. ... ...\n99 WOS:000286328200009 NORTH-EAST ASIA\n100 WOS:000286328200009 PLEISTOCENE\n101 WOS:000286328200009 SAKHALIN ISLAND\n102 WOS:000286373200134 NEURAL NETWORKS\n103 WOS:000286373200134 FUZZY LOGIC\n\n[100 rows x 2 columns]", "text/html": "
" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kw_df = pd.DataFrame()\n", "for c in [\"Keywords Plus\",\"Author Keywords\"]:\n", " kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n", " kwp.name = 'keyword_all'\n", " kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n", "kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n", "kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n", "kw_df.head(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n1 WOS:000209236900010 ACTIVE PERCEPTION; SPARSE CODING; REINFORCEMEN...\n2 WOS:000209331600009 SLEEP PATTERN; ELDER-CARE; PRESSURE SENSOR; NA...\n3 WOS:000209571700012 PERSONALIZED MEDICINE; COMPLEX NETWORK; CLINIC...\n4 WOS:000209810700046 CORROSION CHARACTERIZATION; FEATURE EXTRACTION...", "text/html": "
" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n", "wos_kwd_concat.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "# from keybert import KeyBERT\n", "#\n", "# kw_model = KeyBERT(model='all-mpnet-base-v2')\n", "#\n", "# def kwd_extract(text):\n", "# keywords = kw_model.extract_keywords(text,\n", "#\n", "# keyphrase_ngram_range=(1, 2),\n", "#\n", "# stop_words='english',\n", "#\n", "# highlight=False,\n", "#\n", "# top_n=3)\n", "# return \"; \".join([i[0].upper() for i in keywords])\n", "#\n", "# kwd_extract(text=\"Artificial Intelligence: New Frontiers in Real-Time Inverse Scattering and Electromagnetic Imaging - In recent years, artificial intelligence (AI) techniques have been developed rapidly. With the ...\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "data": { "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "geotext = GeoText()\n", "\n", "def extract_location(input_text, key='countries'):\n", " anomalies = {\"Malta\":\"Malta\",\n", " \"Mongolia\":\"Mongolia\",\n", " \"Quatar\":\"Qatar\",\n", " \"Qatar\":\"Qatar\",\n", " \"Ethiop\":\"Ethiopia\",\n", " \"Nigeria\":\"Nigeria\",\n", " \"BELAR\":\"Belarus\",\n", " \"Venezuela\":\"Venezuela\",\n", " \"Cyprus\":\"Cyprus\",\n", " \"Ecuador\":\"Ecuador\",\n", " \"U Arab\":\"United Arab Emirates\",\n", " \"Syria\":\"Syria\",\n", " \"Uganda\":\"Uganda\",\n", " \"Yemen\":\"Yemen\",\n", " \"Mali\":\"Mali\",\n", " \"Senegal\":\"Senegal\",\n", " \"Vatican\":\"Vatican\",\n", " \"Uruguay\":\"Uruguay\",\n", " 
\"Panama\":\"Panama\",\n", " \"Fiji\":\"Fiji\",\n", " \"Faroe\":\"Faroe Islands\",\n", " \"Macedonia\":\"Macedonia\",\n", " 'Mozambique':'Mozambique',\n", " \"Kuwait\":\"Kuwait\",\n", " \"Libya\":\"Libya\",\n", " \"Turkiy\":\"Turkey\",\n", " \"Liberia\":\"Liberia\",\n", " \"Namibia\":\"Namibia\",\n", " \"Ivoire\":\"Ivory Coast\",\n", " \"Guatemala\":\"Gutemala\",\n", " \"Paraguay\":\"Paraguay\",\n", " \"Honduras\":\"Honduras\",\n", " \"Nicaragua\":\"Nicaragua\",\n", " \"Trinidad\":\"Trinidad & Tobago\",\n", " \"Liechtenstein\":\"Liechtenstein\",\n", " \"Greenland\":\"Denmark\"}\n", "\n", " extracted = geotext.extract(input_text=input_text)\n", " found = extracted[key].keys()\n", " if len(sorted(found))>0:\n", " return sorted(found)[0]\n", " elif key=='countries':\n", " for i in ['Scotland','Wales','England', 'N Ireland']:\n", " if i in input_text:\n", " return 'United Kingdom'\n", " for j in anomalies.keys():\n", " if j in input_text:\n", " return anomalies.get(j)\n", " else:\n", " return None\n", "\n", "with open('../eu_members.txt',\"r\") as f:\n", " eu_countries=f.readline().split(\",\")\n", " eu_countries=[i.strip() for i in eu_countries]\n", "\n", "def country_type(country):\n", " if country in eu_countries:\n", " return \"EU\"\n", " elif country==\"China\":\n", " return \"China\"\n", " elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n", " return \"Non-EU associate\"\n", " else:\n", " return \"Other\"\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n", "locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n", "locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n", "locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])\n", "locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n", "locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n", "locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))" ] }, { "cell_type": "code", "execution_count": 32, "outputs": [], "source": [ "scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n", "locations=locations[locations[\"Country_Type\"].isin(scope_types)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208863600266 Univ Bergen, Ctr Geobiol, Dept Biol, N-5020 B... \\\n2 WOS:000208863600266 Chinese Acad Sci, Guangzhou Inst Geochem, Gua... \n3 WOS:000208863600266 Univ Bergen, Dept Earth Sci, N-5020 Bergen, N... \n5 WOS:000209236900010 Goethe Univ Frankfurt, Frankfurt Inst Adv Stu... \n6 WOS:000209236900010 Ecole Normale Super Cachan Bretagne, Bruz, Fr... \n\n Country City Country_Type \n1 Norway Bergen Non-EU associate \\\n2 China Guangzhou China \n3 Norway Bergen Non-EU associate \n5 Germany Frankfurt (Oder) EU \n6 France Cachan EU \n\n Institution \n1 Univ Bergen \n2 Chinese Acad Sci \n3 Univ Bergen \n5 Goethe Univ Frankfurt \n6 Ecole Normale Super Cachan Bretagne ", "text/html": "
" }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n", "univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n", "univ_locations = univ_locations.drop_duplicates()\n", "univ_locations.head()" ] }, { "cell_type": "code", "execution_count": 38, "outputs": [], "source": [ "import hashlib\n", "\n", "def md5hash(s: str):\n", " return hashlib.md5(s.encode('utf-8')).hexdigest()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n1 WOS:000208863600266 Norway Non-EU associate \n2 WOS:000208863600266 Norway Non-EU associate \n3 WOS:000208863600266 Norway Non-EU associate \n4 WOS:000208863600266 Norway Non-EU associate \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n1 d603b89121a1f279bf03b6f65d1389fa \n2 2fcb84e544f1558ead61dcf846027b7d \n3 6550a1d5fbd1b643f4732d40f2ed4d78 \n4 56485e2bd170d199887af88f3d0a9777 ", "text/html": "
" }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n", "author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n", "author_locations = author_locations.drop(columns=\"Authors_of_address\")\n", "author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n", "author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n", "author_locations = author_locations.drop(columns=\"Author_name\")\n", "author_locations.head()" ] }, { "cell_type": "code", "execution_count": 44, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n6 WOS:000209236900010 China China \n7 WOS:000209236900010 China China \n8 WOS:000209236900010 France EU \n10 WOS:000209236900010 Germany EU \n... ... ... ... \n321236 WOS:000953367000001 China China \n321237 WOS:000953367000001 China China \n321238 WOS:000953367000001 China China \n321239 WOS:000953367000001 China China \n321241 WOS:000953367000001 United Kingdom Non-EU associate \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n6 b406b8485c286091a46aca4999f294d3 \n7 abf37b879540b7c2eeb86787a467de29 \n8 2c559a54c654ab6dbc23d20ae82a0501 \n10 2c559a54c654ab6dbc23d20ae82a0501 \n... ... \n321236 99ef5c82ba66e07f9aa2d3f9fc7c45f7 \n321237 d013bf53d094540f90db9224b3eb9922 \n321238 702962f6fe47bac08520ae556a8e0e02 \n321239 99ef5c82ba66e07f9aa2d3f9fc7c45f7 \n321241 9cc42be570a5464bca0ea4b6b39d0271 \n\n[277884 rows x 4 columns]", "text/html": "
" }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations[author_locations['author_str_id'].duplicated(False)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n", "# author_primary_region\n", "\n", "china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n", "eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n", "assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n", "\n", "\n", "# records that have distinct authors with different country affiliations\n", "valid_scope = wos[((wos[record_col].isin(china))\n", " &\n", " ((wos[record_col].isin(eu))\n", " |\n", " (wos[record_col].isin(assoc))))][record_col].unique()" ] }, { "cell_type": "code", "execution_count": 54, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n299168 WOS:000840488600001 China China \n299169 WOS:000840488600001 China China \n101376 WOS:000434663200012 China China \n101374 WOS:000434663200012 China China \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n299168 3462304c908993a828cdd0ff91ea4aaa \n299169 68ab59c442eb882af13a8273439cf840 \n101376 304c36b8b677f41a489894dc66a8461c \n101374 c04795fe195dcadb58bed5c81125ea35 ", "text/html": "
" }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_primary_region.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of records: 24653\n", "Number of valid records: 22081\n" ] } ], "source": [ "print(f'Number of records: {len(wos)}')\n", "print(f'Number of valid cooperation records: {len(valid_scope)}')" ] }, { "cell_type": "code", "execution_count": 66, "outputs": [], "source": [ "wos = wos[wos[record_col].isin(valid_scope)]\n", "locations = locations[locations[record_col].isin(valid_scope)]\n", "univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n", "author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n", "author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n", "affiliations = affiliations.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 67, "outputs": [ { "data": { "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 2688\nUNIVERSITY OF LONDON 1251\nUDICE-FRENCH RESEARCH UNIVERSITIES 1038\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 978\nTSINGHUA UNIVERSITY 960\n ... \nITALIAN INSTITUTE FOR GENOMIC MEDICINE (IIGM) 1\nSHENYANG INSTITUTE OF ENGINEERING 1\nXIANYANG NORMAL UNIVERSITY 1\nAGILENT TECHNOLOGIES 1\nUNIVERSIDAD DE ESPECIALIDADES ESPIRITU SANTO 1\nName: count, Length: 6117, dtype: int64" }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 68, "outputs": [ { "data": { "text/plain": "Institution\n Chinese Acad Sci 2708\n Tsinghua Univ 1170\n Shanghai Jiao Tong Univ 978\n Zhejiang Univ 902\n Univ Chinese Acad Sci 753\n ... 
\n Univ Namur 1\n Qianhai Inst Innovat Res 1\n UN 1\n Vienna Int Ctr 1\n Engn Res Ctr Urban Underground Space Dev Zhejiang 1\nName: count, Length: 11670, dtype: int64" }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 61, "outputs": [ { "data": { "text/plain": "22081" }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 62, "outputs": [ { "data": { "text/plain": "22081" }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 63, "outputs": [ { "data": { "text/plain": "99343" }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 64, "outputs": [ { "data": { "text/plain": "130533" }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "158916 162684\n" ] } ], "source": [ "aff_ = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "loc_ = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n", "print(len(aff_),len(loc_))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "unique_inst = sorted([i.split(\" \") for i in list(affiliations[\"Affiliations\"].unique())], key=len)\n", "# unique_inst = [[''.join(filter(str.isalnum, i)) for i in i_list] for i_list in unique_inst]\n", "unique_inst = [[i.strip(\",\").strip(\"(\").strip(\")\") for i in i_list] for i_list in unique_inst]\n", "unique_inst" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def institution_chunk_norris(text):\n", " for i in unique_inst:\n", " text_split=text.split(\" \")\n", " text_split=[i.strip(\",\").strip(\"(\").strip(\")\") for i in text_split]\n", " overlap = all(token in text_split for token in i)\n", " if overlap:\n", " return (\" \".join(i))\n", " return \"ERROR\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations_merged\"] = affiliations[\"Affiliations\"].apply(lambda x: institution_chunk_norris(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations_merged\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[affiliations[\"Affiliations_merged\"]==\"ERROR\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.metrics import edit_distance\n", "from 
nltk.metrics import edit_distance_align\n", "#results = df.apply(lambda x: edit_distance(x[\"column1\"], x[\"column2\"]), axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations = affiliations.merge(univ_locations, on=record_col)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.upper().str.strip()\n", "affiliations[\"Institution\"] = affiliations[\"Institution\"].str.upper().str.strip()\n", "\n", "affiliations[\"levehnstein\"] = affiliations.apply(\n", " lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n", "affiliations.head()" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "def tok_overlap(lon_str, short_str):\n", " l,s = lon_str.split(\" \"), short_str.split(\" \")\n", " # create a pairwise distance matrix using NumPy\n", " distance_matrix = np.fromfunction(np.vectorize(lambda i, j: edit_distance(l[int(i)], s[int(j)])), shape=(len(l), len(s)))\n", " distance_frame = pd.DataFrame(data=distance_matrix, columns=s, index=l)\n", "\n", " return min(distance_frame.min().sum(),distance_frame.T.min().sum())\n", "\n", "# lon=(\"UNIVERSITY\",\"AMSTERDAM\",\"TECHNICAL\", \"LOCAL\")\n", "# sho=(\"UNIV\",\"AMSTER\",\"TECH\",\"LOCAL\")\n", "# tok_overlap(lon_str=\" \".join(lon),short_str=\" \".join(sho)).min().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "tok_overlap(lon_str=\" \".join(l),short_str=\" \".join(s)).shape" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations[\"token_overlap\"] = affiliations.apply(\n", " lambda x: tok_overlap(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n", "affiliations.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations.sort_values(by=[record_col,\"Affiliations\",\"token_overlap\"], ascending=[False,False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "helper = affiliations.sort_values(by=[\"Affiliations\",\"token_overlap\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "afh = helper[[\"Affiliations\",\"Institution\",\"Country\"]]\n", "afh.groupby(\"Affiliations\")[\"Institution\"].agg(pd.Series.mode)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "afh.groupby(\"Affiliations\")[\"Country\"].agg(pd.Series.mode)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "helper1 = affiliations.sort_values(by=[\"Affiliations\",\"token_overlap\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "afh1 = helper1[[\"Affiliations\",\"Institution\",\"City\",\"Country\",\"Country_Type\"]]\n", "mode1_i = afh1.groupby(\"Affiliations\")[\"Institution\"].apply(pd.Series.mode).reset_index()\n", "mode1_c = afh1.groupby(\"Affiliations\")[\"Country\"].apply(pd.Series.mode).reset_index()\n", "mode1_city = afh1.groupby(\"Affiliations\")[\"City\"].apply(pd.Series.mode).reset_index()\n", "mode1_type = afh1.groupby(\"Affiliations\")[\"Country_Type\"].apply(pd.Series.mode).reset_index()\n", 
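"# repeat the per-affiliation mode aggregation below, this time ranking candidate institutions by Levenshtein distance;\n", "# the token-overlap and Levenshtein candidate sets are then concatenated and the most frequent value kept per Affiliation\n",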
"\n", "helper2 = affiliations.sort_values(by=[\"Affiliations\",\"levehnstein\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "afh2 = helper2[[\"Affiliations\",\"Institution\",\"City\",\"Country\",\"Country_Type\"]]\n", "mode2_i = afh2.groupby(\"Affiliations\")[\"Institution\"].apply(pd.Series.mode).reset_index()\n", "mode2_c = afh2.groupby(\"Affiliations\")[\"Country\"].apply(pd.Series.mode).reset_index()\n", "mode2_city = afh2.groupby(\"Affiliations\")[\"City\"].apply(pd.Series.mode).reset_index()\n", "mode2_type = afh2.groupby(\"Affiliations\")[\"Country_Type\"].apply(pd.Series.mode).reset_index()\n", "\n", "mode_i = pd.concat([mode1_i,mode2_i],ignore_index=True)[[\"Affiliations\",\"Institution\"]].groupby(\"Affiliations\")[\"Institution\"].agg(\n", " lambda x: pd.Series.mode(x)[0])\n", "mode_c = pd.concat([mode1_c,mode2_c],ignore_index=True)[[\"Affiliations\",\"Country\"]].groupby(\"Affiliations\")[\"Country\"].agg(\n", " lambda x: pd.Series.mode(x)[0])\n", "mode_city = pd.concat([mode1_city,mode2_city],ignore_index=True)[[\"Affiliations\",\"City\"]].groupby(\"Affiliations\")[\"City\"].agg(\n", " lambda x: pd.Series.mode(x)[0])\n", "mode_type = pd.concat([mode1_type,mode2_type],ignore_index=True)[[\"Affiliations\",\"Country_Type\"]].groupby(\"Affiliations\")[\"Country_Type\"].agg(\n", " lambda x: pd.Series.mode(x)[0])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "from functools import reduce\n", "dfs = [mode_i, mode_c, mode_city, mode_type]\n", "mode_final = reduce(lambda left,right: pd.merge(left,right,on='Affiliations'), dfs)\n", "mode_final = mode_final.reset_index()\n", "mode_final.columns = [\"Affiliations\",\"Institution (short name from address)\",\"Country_candidate\",\"City_candidate\",\"Country_type_candidate\"]\n", "mode_final" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "aff_lookup = affiliations[[\"Affiliations\",\"Institution\",\"levehnstein\"]].drop_duplicates().sort_values(by=[\"Affiliations\",\"levehnstein\"],ascending=[True,True])\n", "aff_lookup" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "aff_lookup_levehnstein = aff_lookup.copy()\n", "aff_lookup_overlap = aff_lookup.copy()\n", "inst_short = sorted([i.split(\" \") for i in list(aff_lookup_overlap[\"Institution\"].unique())], key=len)\n", "inst_short" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "aff_lookup.drop_duplicates(subset=\"Affiliations\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "# aff_m = pd.DataFrame(affiliations[\"Affiliations\"].unique(), columns=[\"Affiliations\"])\n", "# inst_m = pd.DataFrame(affiliations[[\"Institution\",\"Country_Type\",\"Country\",\"City\"]].drop_duplicates(),columns=[\"Institution\",\"Country_Type\",\"Country\",\"City\"])\n", "#\n", "# aff_lookup = aff_m.merge(inst_m, how='cross')\n", "#\n", "# # aff_lookup[\"levehnstein\"] = aff_lookup.apply(\n", "# # lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n", "#\n", "# aff_lookup.assign(distance=[*map(edit_distance, aff_lookup.Affiliations, aff_lookup.Institution)])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ 
"affiliations[\"levehnstein\"].plot(kind=\"hist\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations[\"token_overlap\"].plot(kind=\"hist\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "affiliations[affiliations[\"Affiliations\"].str.contains(\"A*STAR\",regex=False)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "affiliations = affiliations.sort_values(by=[record_col,\"Affiliations\",\"levehnstein\"], ascending=[False,False,True])\n", "affiliations_merge = affiliations.drop_duplicates(subset=[record_col,\"Affiliations\"])\n", "affiliations_merge.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_cat[\"WoS Categories\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n", "wos_areas[\"Research Areas\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos_areas[\"Research Areas\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[c for c in wos.columns if \"_English\" in c]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "from matplotlib.ticker import MaxNLocator\n", "import math" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos = wos[((wos[\"Publication Year\"]<2023) & (~wos['Domain_English'].isna()))]\n", "\n", "metrix_levels = [c for c in wos.columns if \"_English\" in c]\n", "for m in metrix_levels:\n", " wos[m] = wos[m].replace({\"article-level classification\":\"Miscellaneous\"})\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "wos" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrix_levels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "outdir=\"wos_processed_data\"" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "os.makedirs(outdir, exist_ok=True)\n", "\n", "wos.to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n", "\n", "locations.drop(columns=\"Addresses\").to_excel(f\"{outdir}/wos_addresses.xlsx\", index=False)\n", "\n", "affiliations_merge.to_excel(f\"{outdir}/wos_affiliations.xlsx\", index=False)\n", "\n", "author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n", "\n", "univ_locations.to_excel(f\"{outdir}/wos_univ_locations.xlsx\", index=False)\n", "mode_final.to_excel(f\"{outdir}/wos_univ_locations_v2.xlsx\", index=False)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "kw_df.to_excel(f\"{outdir}/keywords.xlsx\", index=False)\n", "wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)" ], "metadata": { "collapsed": false } }, { 
"cell_type": "code", "execution_count": null, "outputs": [], "source": [ "# wos_nlp = wos[[record_col,\"Article Title\",\"Abstract\"]]\n", "wos = wos.merge(wos_kwd_concat, on=record_col)\n", "wos[\"Document\"] = wos[\"Article Title\"].str.cat(wos[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ')\n", "# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n", "\n", "vectors = list()\n", "vector_norms = list()\n", "\n", "for doc in nlp.pipe(wos['Document'].astype('unicode').values, batch_size=100,\n", " n_process=4):\n", " vectors.append(doc.vector)\n", " vector_norms.append(doc.vector_norm)\n", "\n", "wos['vector'] = vectors\n", "wos['vector_norm'] = vector_norms\n", "wos['vector_norm'].plot(kind=\"hist\")\n", "from sklearn.manifold import TSNE\n", "import matplotlib.pyplot as plt\n", "% matplotlib inline\n", "\n", "vector_data = pd.DataFrame(wos[\"vector\"].to_list(), index=wos[record_col]).reset_index()\n", "vector_data.head()\n", "\n", "labels = vector_data.values[:, 0]\n", "record_vectors = vector_data.values[:, 1:]\n", "\n", "tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n", "tnse_2d = tsne_model.fit_transform(record_vectors)\n", "tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n", "tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n", "tnse_data.head()\n", "import seaborn as sns\n", "\n", "wos_plot = wos.merge(tnse_data, on=record_col)\n", "\n", "g = sns.scatterplot(wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'], x=\"TNSE-X\", y=\"TNSE-Y\",\n", " hue='Domain_English', s=1)\n", "g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", "wos_plot.head()\n", "wos_nlp = wos_plot[[record_col, \"Document\", \"keyword_all\", \"TNSE-X\", \"TNSE-Y\"]]\n", "g = sns.kdeplot(\n", " data=wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'],\n", " x=\"TNSE-X\", y=\"TNSE-Y\", hue='Domain_English',\n", " thresh=.1,\n", ")\n", "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Domain" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "group = 'Domain_English'\n", "data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=record_col)\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.barplot(data, x=record_col, y=group)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group = ['Publication Year','Domain_English']\n", "# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group = ['Publication Year','Domain_English']\n", "# data = wos.groupby(group)[record_col].nunique().unstack(fill_value=0).stack().reset_index().rename(columns={0:record_col}).sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# g=sns.lineplot(data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# g.legend(title=None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Field" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "# group = ['Publication Year',\"Domain_English\",'Field_English']\n", "# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# g = sns.FacetGrid(data, col=\"Domain_English\", col_wrap=3, height=5)\n", "# g.map_dataframe(sns.lineplot,x=group[0],y=record_col,hue=group[-1])\n", "# g.set_titles(col_template=\"{col_name}\")\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# # g.add_legend()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import matplotlib.pyplot as plt\n", "# for cat in sorted(data[group[-2]].unique()):\n", "# sub_data = data[data[group[-2]]==cat]\n", "# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n", "# ,group[-1],fill_value=0)\n", "# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# g.legend(title=None)\n", "# g.set_title(cat)\n", "# g.yaxis.set_major_locator(MaxNLocator(integer=True))\n", "# plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# SubField" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# group = ['Publication Year',\"Domain_English\",'Field_English',\"SubField_English\"]\n", "# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n", "# data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import matplotlib.pyplot as plt\n", "# for cat in sorted(data[group[-2]].unique()):\n", "# sub_data = data[data[group[-2]]==cat]\n", "# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n", "# ,group[-1],fill_value=0)\n", "# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n", "# g.set(xticks=list(range(2012,2022+1,2)))\n", "# g.legend(title=None,bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., ncols=math.ceil(len(g.legend_.texts)/12))\n", "# g.set_title(cat)\n", "# plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }