{ "cells": [ { "cell_type": "code", "execution_count": 1, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "import shutil\n", "from flashgeotext.geotext import GeoText\n", "import re" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "import hashlib\n", "\n", "def md5hash(s: str):\n", " return hashlib.md5(s.encode('utf-8')).hexdigest()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "record_col=\"UT (Unique WOS ID)\"\n", "outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\"" ] }, { "cell_type": "code", "execution_count": 3, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of initial (valid interval) records: 56196\n", "Number of METRIX filtered records: 49854\n", "Number of unindexed records: 2984\n", "Number of filtered records (dropping duplicates): 49839\n" ] } ], "source": [ "wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n", "\n", "wos = wos[((wos[\"Publication Year\"]<2023)&(wos[\"Publication Year\"]>2010))].copy()\n", "print(f'Number of initial (valid interval) records: {len(wos)}')\n", "\n", "metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n", "\n", "\n", "metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n", "metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n", "metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "\n", "wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n", "wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n", "\n", "wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n", "\n", "\n", "\n", "wos_indexed = wos_merge[~wos_merge[\"Domain_English\"].isna()]\n", "wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n", "\n", "\n", "wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n", "wos = wos_indexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n", "\n", "wos_postmerge = wos.copy()\n", "print(f'Number of METRIX filtered records: {len(wos)}')\n", "print(f'Number of unindexed records: {len(wos_unindexed)}')\n", "\n", "# drop entries not indexed by metrix\n", "# drop duplicates (based on doi)\n", "wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n", "wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n", "print(f'Number of filtered records (dropping duplicates): {len(wos)}')" ] }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "data": { "text/plain": "Domain_English\nApplied Sciences 31871\nNatural Sciences 9542\nHealth Sciences 5942\nEconomic & Social Sciences 1468\narticle-level classification 940\nArts & Humanities 76\nName: count, dtype: int64" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[\"Domain_English\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 6, "outputs": [], "source": [ "wos_classifier = wos[[\"WoS Categories\",\"Research Areas\"]+list(metrix.columns)].copy().drop_duplicates()\n", "wos_classifier = wos_classifier.groupby([\"WoS Categories\",\"Research Areas\"], as_index=False)[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].agg(\n", " lambda x: pd.Series.mode(x)[0])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 7, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found: 2065 \n", "Lost forever: 919\n" ] } ], "source": [ "wos_to_reindex = wos_unindexed.drop(columns=list(metrix.columns))\n", "wos_found = wos_to_reindex.merge(wos_classifier, on=[\"WoS Categories\",\"Research Areas\"], how=\"inner\")\n", "# wos_found = wos_to_reindex.merge(wos_classifier, on=\"Research Areas\", how=\"inner\")\n", "# # wos_found = wos_to_reindex.merge(wos_classifier, on=\"WoS Categories\", how=\"inner\")\n", "wos_stillost = wos_unindexed[~wos_unindexed[record_col].isin(wos_found[record_col])]\n", "\n", "print(\"Found:\", wos_found[record_col].nunique(),\"\\nLost forever:\", wos_stillost[record_col].nunique())" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of records (after remerge): 51904\n" ] } ], "source": [ "wos = pd.concat([wos,wos_found], ignore_index=True)\n", "print(f'Number of records (after remerge): {len(wos)}')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, "outputs": [ { "data": { "text/plain": "Domain_English\nApplied Sciences 33720\nNatural Sciences 9617\nHealth Sciences 6002\nEconomic & Social Sciences 1533\narticle-level classification 955\nArts & Humanities 77\nName: count, dtype: int64" }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[\"Domain_English\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "data": { "text/plain": "WoS Categories\nEngineering, Electrical & Electronic 13661\nComputer Science, Artificial Intelligence 7760\nComputer Science, Information Systems 6481\nTelecommunications 5560\nComputer Science, Theory & Methods 3597\n ... \nMusic 1\nCultural Studies 1\nPsychology, Psychoanalysis 1\nAsian Studies 1\nAndrology 1\nName: count, Length: 236, dtype: int64" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n", "wos_cat[\"WoS Categories\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "data": { "text/plain": "WoS Category\nEngineering 20126\nComputer Science 17613\nTelecommunications 5560\nImaging Science & Photographic Technology 3295\nAutomation & Control Systems 3232\n ... \nMusic 1\nAndrology 1\nLiterature 1\nCultural Studies 1\nAsian Studies 1\nName: count, Length: 177, dtype: int64" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_subcat = wos_cat.copy()\n", "wos_subcat[['WoS Category', 'WoS SubCategory']] = wos_subcat[\"WoS Categories\"].str.split(\",\", expand = True, n=1)\n", "for c in ['WoS Category', 'WoS SubCategory',\"WoS Categories\"]:\n", " wos_subcat[c] = wos_subcat[c].str.strip()\n", "wos_subcat.drop_duplicates(subset=[record_col,'WoS Category'])[\"WoS Category\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [ { "data": { "text/plain": "Research Areas\nEngineering 20176\nComputer Science 17613\nTelecommunications 5560\nEnvironmental Sciences & Ecology 3732\nImaging Science & Photographic Technology 3295\n ... \nLiterature 1\nWomen's Studies 1\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 147, dtype: int64" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n", "wos_areas[\"Research Areas\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [ { "data": { "text/plain": " Article Title \n24862 Kinematic self-calibration of non-contact five... \\\n6623 Optimizing Color Assignment for Perception of ... \n20728 CFD modeling of biomass combustion and gasific... \n41245 Redshift-space distortions in f(R) gravity \n12373 Executable Knowledge Graphs for Machine Learni... \n... ... \n11117 Biochar amendment mitigated N2O emissions from... \n47975 Adaptive Noise Reduction for Sound Event Detec... \n4599 NVM Storage in IoT Devices: Opportunities and ... \n40609 FABNet: Fusion Attention Block and Transfer Le... \n45199 Tea Category Identification Using a Novel Frac... \n\n Keywords Plus \n24862 POSE MEASUREMENT; PARALLEL; MANIPULATOR \\\n6623 OPTIMIZATION; DIFFERENCE \n20728 DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI... \n41245 DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT... \n12373 NaN \n... ... \n11117 NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA... \n47975 NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR... \n4599 ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO... \n40609 NUCLEI \n45199 LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI... \n\n Author Keywords \n24862 kinematic self-calibration; five-axis measurin... \n6623 Color perception; visual design; scatterplots \n20728 Biomass combustion and gasification; CFD simul... \n41245 cosmology: theory; dark energy; large-scale st... \n12373 Knowledge graph; Machine learning; Data analyt... \n... ... \n11117 Biochar; Nitrite accumulation; Nitrous oxide; ... \n47975 sound event detection; non-stationary noise; w... \n4599 IoT; NVM; storage system; energy efficiency; s... \n40609 Cancer; Analytical models; Transfer learning; ... \n45199 tea-category identification; fractional Fourie... \n\n[100 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Article TitleKeywords PlusAuthor Keywords
24862Kinematic self-calibration of non-contact five...POSE MEASUREMENT; PARALLEL; MANIPULATORkinematic self-calibration; five-axis measurin...
6623Optimizing Color Assignment for Perception of ...OPTIMIZATION; DIFFERENCEColor perception; visual design; scatterplots
20728CFD modeling of biomass combustion and gasific...DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...Biomass combustion and gasification; CFD simul...
41245Redshift-space distortions in f(R) gravityDARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...cosmology: theory; dark energy; large-scale st...
12373Executable Knowledge Graphs for Machine Learni...NaNKnowledge graph; Machine learning; Data analyt...
............
11117Biochar amendment mitigated N2O emissions from...NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...Biochar; Nitrite accumulation; Nitrous oxide; ...
47975Adaptive Noise Reduction for Sound Event Detec...NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...sound event detection; non-stationary noise; w...
4599NVM Storage in IoT Devices: Opportunities and ...ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...IoT; NVM; storage system; energy efficiency; s...
40609FABNet: Fusion Attention Block and Transfer Le...NUCLEICancer; Analytical models; Transfer learning; ...
45199Tea Category Identification Using a Novel Frac...LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...tea-category identification; fractional Fourie...
\n

100 rows × 3 columns

\n
" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION\n1 WOS:000208837000001 HARDNESS\n2 WOS:000208837000001 PLASMA-SPRAYED COATING\n3 WOS:000208837000001 INVERSE ANALYSIS\n4 WOS:000208837000001 NUMERICAL METHOD\n.. ... ...\n97 WOS:000209571700012 PERSONALIZED MEDICINE\n98 WOS:000209571700012 COMPLEX NETWORK\n99 WOS:000209571700012 CLINICAL PHENOTYPE NETWORK\n100 WOS:000209571700012 TRADITIONAL CHINESE MEDICINE\n101 WOS:000209617200002 PHYLLOSCOPIDAE\n\n[100 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)keyword_all
0WOS:000208837000001NANOINDENTATION
1WOS:000208837000001HARDNESS
2WOS:000208837000001PLASMA-SPRAYED COATING
3WOS:000208837000001INVERSE ANALYSIS
4WOS:000208837000001NUMERICAL METHOD
.........
97WOS:000209571700012PERSONALIZED MEDICINE
98WOS:000209571700012COMPLEX NETWORK
99WOS:000209571700012CLINICAL PHENOTYPE NETWORK
100WOS:000209571700012TRADITIONAL CHINESE MEDICINE
101WOS:000209617200002PHYLLOSCOPIDAE
\n

100 rows × 2 columns

\n
" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kw_df = pd.DataFrame()\n", "for c in [\"Keywords Plus\",\"Author Keywords\"]:\n", " kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n", " kwp.name = 'keyword_all'\n", " kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n", "kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n", "kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n", "kw_df.head(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...\n1 WOS:000208863600013 COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n2 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n3 WOS:000208863900217 DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n4 WOS:000208935500007 ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)keyword_all
0WOS:000208837000001NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...
1WOS:000208863600013COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...
2WOS:000208863600266ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...
3WOS:000208863900217DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...
4WOS:000208935500007ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...
\n
" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n", "wos_kwd_concat.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 16, "outputs": [ { "data": { "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')" }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "geotext = GeoText()\n", "\n", "def extract_location(input_text, key='countries'):\n", " anomalies = {\"Malta\":\"Malta\",\n", " \"Mongolia\":\"Mongolia\",\n", " \"Quatar\":\"Qatar\",\n", " \"Qatar\":\"Qatar\",\n", " \"Ethiop\":\"Ethiopia\",\n", " \"Nigeria\":\"Nigeria\",\n", " \"BELAR\":\"Belarus\",\n", " \"Venezuela\":\"Venezuela\",\n", " \"Cyprus\":\"Cyprus\",\n", " \"Ecuador\":\"Ecuador\",\n", " \"U Arab\":\"United Arab Emirates\",\n", " \"Syria\":\"Syria\",\n", " \"Uganda\":\"Uganda\",\n", " \"Yemen\":\"Yemen\",\n", " \"Mali\":\"Mali\",\n", " \"Senegal\":\"Senegal\",\n", " \"Vatican\":\"Vatican\",\n", " \"Uruguay\":\"Uruguay\",\n", " \"Panama\":\"Panama\",\n", " \"Fiji\":\"Fiji\",\n", " \"Faroe\":\"Faroe Islands\",\n", " \"Macedonia\":\"Macedonia\",\n", " 'Mozambique':'Mozambique',\n", " \"Kuwait\":\"Kuwait\",\n", " \"Libya\":\"Libya\",\n", " \"Turkiy\":\"Turkey\",\n", " \"Liberia\":\"Liberia\",\n", " \"Namibia\":\"Namibia\",\n", " \"Ivoire\":\"Ivory Coast\",\n", " \"Guatemala\":\"Gutemala\",\n", " \"Paraguay\":\"Paraguay\",\n", " \"Honduras\":\"Honduras\",\n", " \"Nicaragua\":\"Nicaragua\",\n", " \"Trinidad\":\"Trinidad & Tobago\",\n", " \"Liechtenstein\":\"Liechtenstein\",\n", " \"Greenland\":\"Denmark\"}\n", "\n", " extracted = geotext.extract(input_text=input_text)\n", " found = extracted[key].keys()\n", " if len(sorted(found))>0:\n", " return sorted(found)[0]\n", " elif key=='countries':\n", " for i in ['Scotland','Wales','England', 'N Ireland']:\n", " if i in input_text:\n", " return 'United Kingdom'\n", " for j in anomalies.keys():\n", " if j in input_text:\n", " return anomalies.get(j)\n", " else:\n", " return None\n", "\n", "with open('../eu_members.txt',\"r\") as f:\n", " eu_countries=f.readline().split(\",\")\n", " eu_countries=[i.strip() for i in eu_countries]\n", "\n", "def country_cleanup(country):\n", " if \"USA\" in country:\n", " return \"USA\"\n", " elif \"China\" in country:\n", " return \"China\"\n", " elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n", " return \"United Kingdom\"\n", " else:\n", " return country\n", "\n", "\n", "def country_type(country):\n", " if country in eu_countries:\n", " return \"EU\"\n", " elif country==\"China\":\n", " return \"China\"\n", " elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n", " return \"Non-EU associate\"\n", " else:\n", " return \"Other\"\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n", "\n", "\n", "locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n", "locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n", "locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])" ] }, { "cell_type": "code", "execution_count": 19, "outputs": [ { "data": { "text/plain": "312820" }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(locations)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 20, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Authors_of_address \n0 WOS:000208837000001 Gitzhofer, Francois \\\n1 WOS:000208837000001 Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph... \n2 WOS:000208837000001 Guo, Wei-Chao; Zhang, Wei-Hong \n3 WOS:000208837000001 Rauchs, Gast \n4 WOS:000208863600013 Hu, Baolan \n.. ... ... \n95 WOS:000209546000001 Salahuddin, Nawal \n96 WOS:000209546000001 Shrestha, Babu Raja \n97 WOS:000209546000001 Tan, Cheng Cheng \n98 WOS:000209546000001 Tang, Yao-Qing \n99 WOS:000209546000001 Tu, Mei-Lien \n\n Address \n0 Univ Sherbrooke, Dept Chem Engn, Plasma Techno... \n1 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \n2 Northwestern Polytech Univ, Key Lab Contempora... \n3 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n.. ... \n95 Aga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca... \n96 Kathmandu Med Coll Teaching Hosp, Dept Anesthe... \n97 Sultanah Aminah Hosp, Dept Anaesthesia & Inten... \n98 Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,... \n99 Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R... \n\n[100 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)Authors_of_addressAddress
0WOS:000208837000001Gitzhofer, FrancoisUniv Sherbrooke, Dept Chem Engn, Plasma Techno...
1WOS:000208837000001Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...
2WOS:000208837000001Guo, Wei-Chao; Zhang, Wei-HongNorthwestern Polytech Univ, Key Lab Contempora...
3WOS:000208837000001Rauchs, GastCtr Rech Publ Henri Tudor, Dept Adv Mat & Stru...
4WOS:000208863600013Hu, BaolanZhejiang Univ, Dept Environm Engn, Hangzhou 31...
............
95WOS:000209546000001Salahuddin, NawalAga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca...
96WOS:000209546000001Shrestha, Babu RajaKathmandu Med Coll Teaching Hosp, Dept Anesthe...
97WOS:000209546000001Tan, Cheng ChengSultanah Aminah Hosp, Dept Anaesthesia & Inten...
98WOS:000209546000001Tang, Yao-QingShanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...
99WOS:000209546000001Tu, Mei-LienChang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R...
\n

100 rows × 3 columns

\n
" }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n", "locations = locations.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_2\")\n", "locations.head(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 21, "outputs": [], "source": [ "# import dask.dataframe as dd\n", "#\n", "# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame\n", "# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 22, "outputs": [], "source": [ "# locations_test = locations.head(1000)\n", "# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n", "# locations_test" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 23, "outputs": [], "source": [ "\n", "# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n", "locations[\"Country\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n", "locations[\"Country\"]=locations['Country'].apply(lambda x: country_cleanup(x))\n", "locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n", "locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 23, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 24, "outputs": [], "source": [ "scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n", "locations=locations[locations[\"Country_Type\"].isin(scope_types)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208837000001 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \\\n2 WOS:000208837000001 Northwestern Polytech Univ, Key Lab Contempora... \n3 WOS:000208837000001 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 WOS:000208863600013 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n5 WOS:000208863600013 Delft Univ Technol, Dept Biotechnol, Delft, Ne... \n\n Country City Country_Type Institution \n1 Belgium Liège EU Univ Liege \n2 China Xi’an China Northwestern Polytech Univ \n3 Luxembourg Luxembourg EU Ctr Rech Publ Henri Tudor \n4 China Hangzhou China Zhejiang Univ \n5 Netherlands Delft EU Delft Univ Technol ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)AddressCountryCityCountry_TypeInstitution
1WOS:000208837000001Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...BelgiumLiègeEUUniv Liege
2WOS:000208837000001Northwestern Polytech Univ, Key Lab Contempora...ChinaXi’anChinaNorthwestern Polytech Univ
3WOS:000208837000001Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru...LuxembourgLuxembourgEUCtr Rech Publ Henri Tudor
4WOS:000208863600013Zhejiang Univ, Dept Environm Engn, Hangzhou 31...ChinaHangzhouChinaZhejiang Univ
5WOS:000208863600013Delft Univ Technol, Dept Biotechnol, Delft, Ne...NetherlandsDelftEUDelft Univ Technol
\n
" }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n", "univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n", "univ_locations = univ_locations.drop_duplicates()\n", "univ_locations.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type author_str_id\n0 WOS:000208837000001 Belgium EU 6079964a4094c607358a130e41e89f90\n1 WOS:000208837000001 Belgium EU 2321037fa90ac94a23b88a79f1c7f454\n2 WOS:000208837000001 Belgium EU 8a1bfa1e7bc52d323f0d9c23a9b74ed3\n3 WOS:000208837000001 China China 6079964a4094c607358a130e41e89f90\n4 WOS:000208837000001 China China 17fb036de6a4db3ba39ccab3d8307c04", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)CountryCountry_Typeauthor_str_id
0WOS:000208837000001BelgiumEU6079964a4094c607358a130e41e89f90
1WOS:000208837000001BelgiumEU2321037fa90ac94a23b88a79f1c7f454
2WOS:000208837000001BelgiumEU8a1bfa1e7bc52d323f0d9c23a9b74ed3
3WOS:000208837000001ChinaChina6079964a4094c607358a130e41e89f90
4WOS:000208837000001ChinaChina17fb036de6a4db3ba39ccab3d8307c04
\n
" }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n", "author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n", "author_locations = author_locations.drop(columns=\"Authors_of_address\")\n", "author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n", "author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n", "author_locations = author_locations.drop(columns=\"Author_name\")\n", "author_locations.head()" ] }, { "cell_type": "code", "execution_count": 27, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208837000001 Belgium EU \\\n3 WOS:000208837000001 China China \n4 WOS:000208837000001 China China \n6 WOS:000208863600013 China China \n7 WOS:000208863600013 Netherlands EU \n... ... ... ... \n643323 WOS:000964683900016 Italy EU \n643324 WOS:000964683900016 Italy EU \n643325 WOS:000967389100001 China China \n643326 WOS:000967389100001 Norway Non-EU associate \n643327 WOS:000967389100001 Norway Non-EU associate \n\n author_str_id \n0 6079964a4094c607358a130e41e89f90 \n3 6079964a4094c607358a130e41e89f90 \n4 17fb036de6a4db3ba39ccab3d8307c04 \n6 54c7bc6fe9b77434ca1bf04d763d843b \n7 df81f9da6c8f5c968c16ef0aab1bb8f9 \n... ... \n643323 3c631398a81ab7058d95a0c6418a2c0b \n643324 3c631398a81ab7058d95a0c6418a2c0b \n643325 ce65541a6c334225a9617439f4a95012 \n643326 7c52a53f8d79b1ffd4f2e4cde9548e1d \n643327 7c52a53f8d79b1ffd4f2e4cde9548e1d \n\n[573569 rows x 4 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)CountryCountry_Typeauthor_str_id
0WOS:000208837000001BelgiumEU6079964a4094c607358a130e41e89f90
3WOS:000208837000001ChinaChina6079964a4094c607358a130e41e89f90
4WOS:000208837000001ChinaChina17fb036de6a4db3ba39ccab3d8307c04
6WOS:000208863600013ChinaChina54c7bc6fe9b77434ca1bf04d763d843b
7WOS:000208863600013NetherlandsEUdf81f9da6c8f5c968c16ef0aab1bb8f9
...............
643323WOS:000964683900016ItalyEU3c631398a81ab7058d95a0c6418a2c0b
643324WOS:000964683900016ItalyEU3c631398a81ab7058d95a0c6418a2c0b
643325WOS:000967389100001ChinaChinace65541a6c334225a9617439f4a95012
643326WOS:000967389100001NorwayNon-EU associate7c52a53f8d79b1ffd4f2e4cde9548e1d
643327WOS:000967389100001NorwayNon-EU associate7c52a53f8d79b1ffd4f2e4cde9548e1d
\n

573569 rows × 4 columns

\n
" }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations[author_locations['author_str_id'].duplicated(False)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n", "# author_primary_region\n", "\n", "china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n", "eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n", "assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n", "\n", "\n", "# records that have distinct authors with different country affiliations\n", "valid_scope = wos[((wos[record_col].isin(china))\n", " &\n", " ((wos[record_col].isin(eu))\n", " |\n", " (wos[record_col].isin(assoc))))][record_col].unique()" ] }, { "cell_type": "code", "execution_count": 29, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n537692 WOS:000732204600001 China China \\\n204027 WOS:000414089800001 China China \n204028 WOS:000414089800001 China China \n204029 WOS:000414089800001 China China \n204030 WOS:000414090800001 China China \n\n author_str_id \n537692 8fe31cbbd07c639aa4d779688896be81 \n204027 67c7beb18fafd77f1319739fa683bc5e \n204028 7269f0a31fc620688aae12aad9e3cd85 \n204029 ac28aea698a527fb5195d3d24189ea04 \n204030 6c91bf481b6bddc1426d12a18823224a ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)CountryCountry_Typeauthor_str_id
537692WOS:000732204600001ChinaChina8fe31cbbd07c639aa4d779688896be81
204027WOS:000414089800001ChinaChina67c7beb18fafd77f1319739fa683bc5e
204028WOS:000414089800001ChinaChina7269f0a31fc620688aae12aad9e3cd85
204029WOS:000414089800001ChinaChinaac28aea698a527fb5195d3d24189ea04
204030WOS:000414090800001ChinaChina6c91bf481b6bddc1426d12a18823224a
\n
" }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_primary_region.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of records: 51904\n", "Number of valid cooperation records: 46060\n" ] } ], "source": [ "print(f'Number of records: {len(wos)}')\n", "print(f'Number of valid cooperation records: {len(valid_scope)}')" ] }, { "cell_type": "code", "execution_count": 31, "outputs": [], "source": [ "wos = wos[wos[record_col].isin(valid_scope)]\n", "locations = locations[locations[record_col].isin(valid_scope)]\n", "univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n", "author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n", "author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n", "affiliations = affiliations.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 33, "outputs": [ { "data": { "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 5616\nUNIVERSITY OF LONDON 2604\nUDICE-FRENCH RESEARCH UNIVERSITIES 2240\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 2170\nTSINGHUA UNIVERSITY 1935\n ... \nUNIVERSITY OF FUKUI 1\nPONTIFICIA UNIVERSIDADE CATOLICA DE GOIAS 1\nINSTITUTE OF ORGANIC CHEMISTRY & BIOCHEMISTRY OF THE CZECH ACADEMY OF SCIENCES 1\nUNIVERSITAS PELITA HARAPAN 1\nFRANCISCUS GASTHUIS 1\nName: count, Length: 7609, dtype: int64" }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 34, "outputs": [ { "data": { "text/plain": "Institution\nChinese Acad Sci 5749\nTsinghua Univ 2315\nShanghai Jiao Tong Univ 1976\nZhejiang Univ 1806\nPeking Univ 1661\n ... \nNatl Technol Inst Mental Disorders 1\nSeinajoki Univ Appl Sci 1\nJD Intelligent City Res 1\nCAS Ctr Excellence Planetol 1\nKey Lab Intelligent Prevent Med Zhejiang Prov 1\nName: count, Length: 19821, dtype: int64" }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 35, "outputs": [ { "data": { "text/plain": "46060" }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 36, "outputs": [ { "data": { "text/plain": "46060" }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 37, "outputs": [ { "data": { "text/plain": "202790" }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 38, "outputs": [ { "data": { "text/plain": "268471" }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": "WoS Categories\n Engineering, Electrical & Electronic 8303\nComputer Science, Artificial Intelligence 6115\n Telecommunications 4661\nComputer Science, Information Systems 4584\nEngineering, Electrical & Electronic 4036\n ... \nCultural Studies 1\n Ornithology 1\n Criminology & Penology 1\nArt 1\n Psychology, Developmental 1\nName: count, Length: 425, dtype: int64" }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_cat[\"WoS Categories\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": "Research Areas\nEngineering 18098\nComputer Science 15658\nTelecommunications 5046\nEnvironmental Sciences & Ecology 3246\nImaging Science & Photographic Technology 2947\n ... \nFilm, Radio & Television 2\nArea Studies 2\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 145, dtype: int64" }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n", "wos_areas[\"Research Areas\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": "['Domain_English', 'Field_English', 'SubField_English']" }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[c for c in wos.columns if \"_English\" in c]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "metrix_levels = [c for c in wos.columns if \"_English\" in c]\n", "for m in metrix_levels:\n", " wos[m] = wos[m].replace({\"article-level classification\":\"Multidisciplinary\"})\n" ] }, { "cell_type": "code", "execution_count": 42, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": " Publication Type Authors \n0 J Yan, Z; Jing, XY; Pedrycz, W \\\n1 J Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ... \n2 J Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue... \n3 J Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ... \n4 J Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu... \n... ... ... \n51897 J Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ... \n51898 J Wang, HC; Roussel, P; Denby, B \n51899 J Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S \n51902 J Chu, WP; Song, Y \n51903 J Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;... \n\n Book Authors Book Editors Book Group Authors \n0 NaN NaN NaN \\\n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n... ... ... ... \n51897 NaN NaN NaN \n51898 NaN NaN NaN \n51899 NaN NaN NaN \n51902 NaN NaN NaN \n51903 NaN NaN NaN \n\n Author Full Names \n0 Yan, Zheng; Jing, Xuyang; Pedrycz, Witold \\\n1 Sookhak, Mehdi; Yu, F. Richard; He, Ying; Tale... \n2 Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G... \n3 Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum... \n4 Lu, Tianguang; Chen, Xinyu; McElroy, Michael B... \n... ... \n51897 Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir... \n51898 Wang, Hongcui; Roussel, Pierre; Denby, Bruce \n51899 Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh... \n51902 Chu, Wenping; Song, Yang \n51903 Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa... \n\n Book Author Full Names Group Authors \n0 NaN NaN \\\n1 NaN NaN \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n Article Title \n0 LEFusing and mining opinions for reputation ge... \\\n1 FOG VEHICULAR COMPUTING Augmentation of Fog Co... \n2 Deep Reinforcement Learning for Intelligent In... \n3 An Intelligent UAV based Data Aggregation Algo... \n4 A Reinforcement Learning-Based Decision System... \n... ... \n51897 Neural modal ordinary differential equations: ... \n51898 Improving ultrasound-based multimodal speech r... \n51899 Application of computer-aided image reconstruc... \n51902 Study on Dynamic Interaction of Railway Pantog... \n51903 A Review of Technical Standards for Smart Cities \n\n Source Title ... \n0 INFORMATION FUSION ... \\\n1 IEEE VEHICULAR TECHNOLOGY MAGAZINE ... \n2 IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ... ... \n3 COMPUTER NETWORKS ... \n4 IEEE TRANSACTIONS ON SMART GRID ... \n... ... ... \n51897 DATA-CENTRIC ENGINEERING ... \n51898 JASA EXPRESS LETTERS ... \n51899 EGYPTIAN JOURNAL OF NEUROSURGERY ... \n51902 VIBRATION ... \n51903 CLEAN TECHNOLOGIES ... \n\n UT (Unique WOS ID) issn_var issn Domain_English \n0 WOS:000394070100013 issn 15662535 Applied Sciences \\\n1 WOS:000408568800008 issn 15566072 Applied Sciences \n2 WOS:000502789700018 issn 23327731 Applied Sciences \n3 WOS:000626758800004 issn 13891286 Applied Sciences \n4 WOS:000641976000028 issn 19493053 Applied Sciences \n... ... ... ... ... \n51897 WOS:000906995300001 eissn NaN Applied Sciences \n51898 WOS:000642230800005 eissn NaN Natural Sciences \n51899 WOS:000807222600001 eissn NaN Health Sciences \n51902 WOS:000661660800001 eissn NaN Applied Sciences \n51903 WOS:000708219500008 eissn NaN Natural Sciences \n\n Field_English \n0 Information & Communication Technologies \\\n1 Information & Communication Technologies \n2 Information & Communication Technologies \n3 Information & Communication Technologies \n4 Enabling & Strategic Technologies \n... ... \n51897 Information & Communication Technologies \n51898 Physics & Astronomy \n51899 Clinical Medicine \n51902 Engineering \n51903 Earth & Environmental Sciences \n\n SubField_English 2.00 SEQ \n0 Artificial Intelligence & Image Processing 31 \\\n1 Networking & Telecommunications 37 \n2 Networking & Telecommunications 37 \n3 Networking & Telecommunications 37 \n4 Energy 14 \n... ... ... \n51897 Artificial Intelligence & Image Processing NaN \n51898 Acoustics NaN \n51899 Neurology & Neurosurgery NaN \n51902 Mechanical Engineering & Transports NaN \n51903 Environmental Sciences NaN \n\n Source_title srcid \n0 Information Fusion 2.609900e+04 \\\n1 IEEE Vehicular Technology Magazine 5.200153e+09 \n2 IEEE Transactions on Cognitive Communications ... 2.110085e+10 \n3 Computer Networks 2.681100e+04 \n4 IEEE Transactions on Smart Grid 1.970017e+10 \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n issn_type \n0 issn1 \n1 issn1 \n2 issn1 \n3 issn1 \n4 issn2 \n... ... \n51897 NaN \n51898 NaN \n51899 NaN \n51902 NaN \n51903 NaN \n\n[46060 rows x 80 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Publication TypeAuthorsBook AuthorsBook EditorsBook Group AuthorsAuthor Full NamesBook Author Full NamesGroup AuthorsArticle TitleSource Title...UT (Unique WOS ID)issn_varissnDomain_EnglishField_EnglishSubField_English2.00 SEQSource_titlesrcidissn_type
0JYan, Z; Jing, XY; Pedrycz, WNaNNaNNaNYan, Zheng; Jing, Xuyang; Pedrycz, WitoldNaNNaNLEFusing and mining opinions for reputation ge...INFORMATION FUSION...WOS:000394070100013issn15662535Applied SciencesInformation & Communication TechnologiesArtificial Intelligence & Image Processing31Information Fusion2.609900e+04issn1
1JSookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...NaNNaNNaNSookhak, Mehdi; Yu, F. Richard; He, Ying; Tale...NaNNaNFOG VEHICULAR COMPUTING Augmentation of Fog Co...IEEE VEHICULAR TECHNOLOGY MAGAZINE...WOS:000408568800008issn15566072Applied SciencesInformation & Communication TechnologiesNetworking & Telecommunications37IEEE Vehicular Technology Magazine5.200153e+09issn1
2JNing, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...NaNNaNNaNNing, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...NaNNaNDeep Reinforcement Learning for Intelligent In...IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ......WOS:000502789700018issn23327731Applied SciencesInformation & Communication TechnologiesNetworking & Telecommunications37IEEE Transactions on Cognitive Communications ...2.110085e+10issn1
3JWang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...NaNNaNNaNWang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...NaNNaNAn Intelligent UAV based Data Aggregation Algo...COMPUTER NETWORKS...WOS:000626758800004issn13891286Applied SciencesInformation & Communication TechnologiesNetworking & Telecommunications37Computer Networks2.681100e+04issn1
4JLu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...NaNNaNNaNLu, Tianguang; Chen, Xinyu; McElroy, Michael B...NaNNaNA Reinforcement Learning-Based Decision System...IEEE TRANSACTIONS ON SMART GRID...WOS:000641976000028issn19493053Applied SciencesEnabling & Strategic TechnologiesEnergy14IEEE Transactions on Smart Grid1.970017e+10issn2
..................................................................
51897JLai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...NaNNaNNaNLai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...NaNNaNNeural modal ordinary differential equations: ...DATA-CENTRIC ENGINEERING...WOS:000906995300001eissnNaNApplied SciencesInformation & Communication TechnologiesArtificial Intelligence & Image ProcessingNaNNaNNaNNaN
51898JWang, HC; Roussel, P; Denby, BNaNNaNNaNWang, Hongcui; Roussel, Pierre; Denby, BruceNaNNaNImproving ultrasound-based multimodal speech r...JASA EXPRESS LETTERS...WOS:000642230800005eissnNaNNatural SciencesPhysics & AstronomyAcousticsNaNNaNNaNNaN
51899JZhang, R; Alpdogan, S; Kong, SQ; Muhammad, SNaNNaNNaNZhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...NaNNaNApplication of computer-aided image reconstruc...EGYPTIAN JOURNAL OF NEUROSURGERY...WOS:000807222600001eissnNaNHealth SciencesClinical MedicineNeurology & NeurosurgeryNaNNaNNaNNaN
51902JChu, WP; Song, YNaNNaNNaNChu, Wenping; Song, YangNaNNaNStudy on Dynamic Interaction of Railway Pantog...VIBRATION...WOS:000661660800001eissnNaNApplied SciencesEngineeringMechanical Engineering & TransportsNaNNaNNaNNaN
51903JLai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;...NaNNaNNaNLai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa...NaNNaNA Review of Technical Standards for Smart CitiesCLEAN TECHNOLOGIES...WOS:000708219500008eissnNaNNatural SciencesEarth & Environmental SciencesEnvironmental SciencesNaNNaNNaNNaN
\n

46060 rows × 80 columns

\n
" }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": "['Domain_English', 'Field_English', 'SubField_English']" }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metrix_levels" ] }, { "cell_type": "code", "execution_count": 45, "outputs": [], "source": [ "record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n", "record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n", "record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n", "country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 46, "outputs": [], "source": [ "# Basic network layout" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 47, "outputs": [], "source": [ "country_collabs = record_countries.merge(record_countries, on=record_col)\n", "country_collabs = country_collabs[country_collabs[\"Country_x\"]!=country_collabs[\"Country_y\"]]\n", "country_collabs[\"weight\"] = 0.5" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 48, "outputs": [], "source": [ "inst_collabs = record_institution.merge(record_institution, on=record_col)\n", "inst_collabs = inst_collabs[inst_collabs[\"Institution_x\"]!=inst_collabs[\"Institution_y\"]]\n", "inst_collabs[\"weight\"] = 0.5" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 49, "outputs": [ { "data": { "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')" }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 50, "outputs": [ { "data": { "text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']" }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drop_cols = [ws for ws in wos.columns if ((\"uthor\" in ws or \"ddress\" in ws or \"ORCID\" in\n", " ws or \"esearcher\" in ws or \"ditor\" in ws or \"name\" in ws or 'SEQ' in ws) and \"eyword\" not in ws)]\n", "drop_cols" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 51, "outputs": [], "source": [ "outdir=\"wos_processed_data\"" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 52, "outputs": [], "source": [ "os.makedirs(outdir, exist_ok=True)\n", "\n", "wos.drop(columns=drop_cols).to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n", "\n", "record_countries.to_excel(f\"{outdir}/wos_countries.xlsx\", index=False)\n", "\n", "record_author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n", "\n", "record_institution.to_excel(f\"{outdir}/wos_institution_locations.xlsx\", index=False)\n", "\n", "kw_df.to_excel(f\"{outdir}/wos_keywords.xlsx\", index=False)\n", "\n", "country_types.to_excel(f\"{outdir}/wos_country_types.xlsx\", index=False)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 53, "outputs": [], "source": [ "wos.drop(columns=drop_cols).to_csv(f\"{outdir}/wos_processed.csv\", index=False, sep='\\t')\n", "\n", "record_countries.to_csv(f\"{outdir}/wos_countries.csv\", index=False, sep='\\t')\n", "\n", "record_author_locations.to_csv(f\"{outdir}/wos_author_locations.csv\", index=False, sep='\\t')\n", "\n", "record_institution.to_csv(f\"{outdir}/wos_institution_locations.csv\", index=False, sep='\\t')\n", "\n", "kw_df.to_csv(f\"{outdir}/wos_keywords.csv\", index=False, sep='\\t')\n", "\n", "country_types.to_csv(f\"{outdir}/wos_country_types.csv\", index=False, sep='\\t')\n", "\n", "inst_collabs.to_csv(f\"{outdir}/wos_inst_collabs.csv\", index=False, sep='\\t')\n", "\n", "country_collabs.to_csv(f\"{outdir}/wos_country_collabs.csv\", index=False, sep='\\t')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 54, "outputs": [], "source": [ "wos_areas.to_csv(f\"{outdir}/wos_research_areas.csv\", index=False, sep='\\t')\n", "\n", "wos_subcat.to_csv(f\"{outdir}/wos_categories.csv\", index=False, sep='\\t')" ], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }