{ "cells": [ { "cell_type": "code", "execution_count": 1, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "import shutil\n", "from flashgeotext.geotext import GeoText\n", "import re" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "import hashlib\n", "\n", "def md5hash(s: str):\n", " return hashlib.md5(s.encode('utf-8')).hexdigest()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "record_col=\"UT (Unique WOS ID)\"\n", "outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\"" ] }, { "cell_type": "code", "execution_count": 3, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": " Publication Type Authors \n29758 C Fu, YC; Liu, YH; Gao, ZW \\\n34098 J Han, D; Zhang, CH; Fauconnier, ML \n55478 C Xu, YX; Liu, M; Peng, L; Zhang, JQ; Zheng, YW \n32260 C Liu, Q; Cai, WD; Fu, ZJ; Shen, J; Linge, N \n8751 J Shamshirband, S; Nodoushan, EJ; Adolf, JE; Man... \n... ... ... \n6151 C Seufert, M; Casas, P; Wehner, N; Gang, L; Li, K \n32052 J Huber, A; Kinna, D; Huber, V; Arnoux, G; Balbo... \n27985 J Dong, GP; Ma, J; Kwan, MP; Wang, YM; Chai, YW \n2939 J Yin, ZY; Jin, YF; Huang, HW; Shen, SL \n34651 J Wang, JH; Lindenbergh, R; Menenti, M \n\n Book Authors Book Editors \n29758 NaN Yu, H \\\n34098 NaN NaN \n55478 NaN NaN \n32260 NaN Fang, WC; Vasilakos, T; Stoica, A; Kwak, YS \n8751 NaN NaN \n... ... ... \n6151 NaN Galis, A; Guillemin, F; Noldus, R; Secci, S; I... \n32052 NaN NaN \n27985 NaN NaN \n2939 NaN NaN \n34651 NaN NaN \n\n Book Group Authors Author Full Names \n29758 NaN Fu, Yichuan; Liu, Yuanhong; Gao, Zhiwei \\\n34098 NaN Han, Dong; Zhang, Chun-Hui; Fauconnier, Marie-... \n55478 IEEE Xu, Yuxuan; Liu, Ming; Peng, Linning; Zhang, J... \n32260 NaN Liu, Qi; Cai, Weidong; Fu, Zhangjie; Shen, Jia... \n8751 NaN Shamshirband, Shahaboddin; Nodoushan, Ehsan Ja... \n... ... ... \n6151 NaN Seufert, Michael; Casas, Pedro; Wehner, Nikola... \n32052 NaN Huber, A.; Kinna, D.; Huber, V.; Arnoux, G.; B... \n27985 NaN Dong, Guanpeng; Ma, Jing; Kwan, Mei-Po; Wang, ... \n2939 NaN Yin Zhen-Yu; Jin Yin-Fu; Huang Hong-Wei; Shen ... \n34651 NaN Wang, Jinhu; Lindenbergh, Roderik; Menenti, Ma... \n\n Book Author Full Names Group Authors \n29758 NaN NaN \\\n34098 NaN NaN \n55478 NaN NaN \n32260 NaN NaN \n8751 NaN NaN \n... ... ... \n6151 NaN NaN \n32052 NaN JET Contributors \n27985 NaN NaN \n2939 NaN NaN \n34651 NaN NaN \n\n Article Title \n29758 Multiple Actuator Fault Classification in Wind... \\\n34098 Effect of Seasoning Addition on Volatile Compo... \n55478 Colluding RF Fingerprint Impersonation Attack ... \n32260 An Optimized Strategy for Speculative Executio... \n8751 Ensemble models with uncertainty analysis for ... \n... ... \n6151 Stream-based Machine Learning for Real-time Qo... \n32052 The near infrared imaging system for the real-... \n27985 Multi-level temporal autoregressive modelling ... \n2939 Evolutionary polynomial regression based model... \n34651 SigVox - A 3D feature matching algorithm for a... \n\n Source Title ... \n29758 2019 25TH IEEE INTERNATIONAL CONFERENCE ON AUT... ... \\\n34098 FOODS ... \n55478 IEEE INTERNATIONAL CONFERENCE ON COMMUNICATION... ... \n32260 2015 9TH INTERNATIONAL CONFERENCE ON FUTURE GE... ... \n8751 ENGINEERING APPLICATIONS OF COMPUTATIONAL FLUI... ... \n... ... ... \n6151 PROCEEDINGS OF THE 2019 22ND CONFERENCE ON INN... ... \n32052 PHYSICA SCRIPTA ... \n27985 INTERNATIONAL JOURNAL OF GEOGRAPHICAL INFORMAT... ... \n2939 ENGINEERING GEOLOGY ... \n34651 ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SEN... ... \n\n WoS Categories \n29758 Automation & Control Systems; Computer Science... \\\n34098 Food Science & Technology \n55478 Telecommunications \n32260 Computer Science, Hardware & Architecture \n8751 Engineering, Multidisciplinary; Engineering, M... \n... ... \n6151 Computer Science, Hardware & Architecture; Com... \n32052 Physics, Multidisciplinary \n27985 Computer Science, Information Systems; Geograp... \n2939 Engineering, Geological; Geosciences, Multidis... \n34651 Geography, Physical; Geosciences, Multidiscipl... \n\n Web of Science Index \n29758 Conference Proceedings Citation Index - Scienc... \\\n34098 Science Citation Index Expanded (SCI-EXPANDED) \n55478 Conference Proceedings Citation Index - Scienc... \n32260 Conference Proceedings Citation Index - Scienc... \n8751 Science Citation Index Expanded (SCI-EXPANDED) \n... ... \n6151 Conference Proceedings Citation Index - Scienc... \n32052 Science Citation Index Expanded (SCI-EXPANDED)... \n27985 Science Citation Index Expanded (SCI-EXPANDED)... \n2939 Science Citation Index Expanded (SCI-EXPANDED) \n34651 Science Citation Index Expanded (SCI-EXPANDED) \n\n Research Areas IDS Number \n29758 Automation & Control Systems; Computer Science BP9AN \\\n34098 Food Science & Technology PV8DT \n55478 Telecommunications BT9VG \n32260 Computer Science BF1GE \n8751 Engineering; Mechanics HE2WU \n... ... ... \n6151 Computer Science BM8PP \n32052 Physics FL3JX \n27985 Computer Science; Geography; Physical Geograph... GS7LK \n2939 Engineering; Geology DS2IG \n34651 Physical Geography; Geology; Remote Sensing; I... EX2BV \n\n Pubmed Id Open Access Designations Highly Cited Status \n29758 NaN NaN NaN \\\n34098 33406625.0 gold, Green Published NaN \n55478 NaN NaN NaN \n32260 NaN NaN NaN \n8751 NaN Green Published, gold Y \n... ... ... ... \n6151 NaN NaN NaN \n32052 NaN NaN NaN \n27985 NaN hybrid, Green Published NaN \n2939 NaN NaN NaN \n34651 NaN NaN NaN \n\n Hot Paper Status Date of Export UT (Unique WOS ID) \n29758 NaN 2023-04-28 WOS:000568623100060 \n34098 NaN 2023-04-28 WOS:000610212800001 \n55478 NaN 2023-04-28 WOS:000864709903078 \n32260 NaN 2023-04-28 WOS:000380393500003 \n8751 N 2023-04-28 WOS:000453212200001 \n... ... ... ... \n6151 NaN 2023-04-28 WOS:000469794500014 \n32052 NaN 2023-04-28 WOS:000414120500027 \n27985 NaN 2023-04-28 WOS:000443882300004 \n2939 NaN 2023-04-28 WOS:000380592100015 \n34651 NaN 2023-04-28 WOS:000403031400010 \n\n[100 rows x 71 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Publication TypeAuthorsBook AuthorsBook EditorsBook Group AuthorsAuthor Full NamesBook Author Full NamesGroup AuthorsArticle TitleSource Title...WoS CategoriesWeb of Science IndexResearch AreasIDS NumberPubmed IdOpen Access DesignationsHighly Cited StatusHot Paper StatusDate of ExportUT (Unique WOS ID)
29758CFu, YC; Liu, YH; Gao, ZWNaNYu, HNaNFu, Yichuan; Liu, Yuanhong; Gao, ZhiweiNaNNaNMultiple Actuator Fault Classification in Wind...2019 25TH IEEE INTERNATIONAL CONFERENCE ON AUT......Automation & Control Systems; Computer Science...Conference Proceedings Citation Index - Scienc...Automation & Control Systems; Computer ScienceBP9ANNaNNaNNaNNaN2023-04-28WOS:000568623100060
34098JHan, D; Zhang, CH; Fauconnier, MLNaNNaNNaNHan, Dong; Zhang, Chun-Hui; Fauconnier, Marie-...NaNNaNEffect of Seasoning Addition on Volatile Compo...FOODS...Food Science & TechnologyScience Citation Index Expanded (SCI-EXPANDED)Food Science & TechnologyPV8DT33406625.0gold, Green PublishedNaNNaN2023-04-28WOS:000610212800001
55478CXu, YX; Liu, M; Peng, L; Zhang, JQ; Zheng, YWNaNNaNIEEEXu, Yuxuan; Liu, Ming; Peng, Linning; Zhang, J...NaNNaNColluding RF Fingerprint Impersonation Attack ...IEEE INTERNATIONAL CONFERENCE ON COMMUNICATION......TelecommunicationsConference Proceedings Citation Index - Scienc...TelecommunicationsBT9VGNaNNaNNaNNaN2023-04-28WOS:000864709903078
32260CLiu, Q; Cai, WD; Fu, ZJ; Shen, J; Linge, NNaNFang, WC; Vasilakos, T; Stoica, A; Kwak, YSNaNLiu, Qi; Cai, Weidong; Fu, Zhangjie; Shen, Jia...NaNNaNAn Optimized Strategy for Speculative Executio...2015 9TH INTERNATIONAL CONFERENCE ON FUTURE GE......Computer Science, Hardware & ArchitectureConference Proceedings Citation Index - Scienc...Computer ScienceBF1GENaNNaNNaNNaN2023-04-28WOS:000380393500003
8751JShamshirband, S; Nodoushan, EJ; Adolf, JE; Man...NaNNaNNaNShamshirband, Shahaboddin; Nodoushan, Ehsan Ja...NaNNaNEnsemble models with uncertainty analysis for ...ENGINEERING APPLICATIONS OF COMPUTATIONAL FLUI......Engineering, Multidisciplinary; Engineering, M...Science Citation Index Expanded (SCI-EXPANDED)Engineering; MechanicsHE2WUNaNGreen Published, goldYN2023-04-28WOS:000453212200001
..................................................................
6151CSeufert, M; Casas, P; Wehner, N; Gang, L; Li, KNaNGalis, A; Guillemin, F; Noldus, R; Secci, S; I...NaNSeufert, Michael; Casas, Pedro; Wehner, Nikola...NaNNaNStream-based Machine Learning for Real-time Qo...PROCEEDINGS OF THE 2019 22ND CONFERENCE ON INN......Computer Science, Hardware & Architecture; Com...Conference Proceedings Citation Index - Scienc...Computer ScienceBM8PPNaNNaNNaNNaN2023-04-28WOS:000469794500014
32052JHuber, A; Kinna, D; Huber, V; Arnoux, G; Balbo...NaNNaNNaNHuber, A.; Kinna, D.; Huber, V.; Arnoux, G.; B...NaNJET ContributorsThe near infrared imaging system for the real-...PHYSICA SCRIPTA...Physics, MultidisciplinaryScience Citation Index Expanded (SCI-EXPANDED)...PhysicsFL3JXNaNNaNNaNNaN2023-04-28WOS:000414120500027
27985JDong, GP; Ma, J; Kwan, MP; Wang, YM; Chai, YWNaNNaNNaNDong, Guanpeng; Ma, Jing; Kwan, Mei-Po; Wang, ...NaNNaNMulti-level temporal autoregressive modelling ...INTERNATIONAL JOURNAL OF GEOGRAPHICAL INFORMAT......Computer Science, Information Systems; Geograp...Science Citation Index Expanded (SCI-EXPANDED)...Computer Science; Geography; Physical Geograph...GS7LKNaNhybrid, Green PublishedNaNNaN2023-04-28WOS:000443882300004
2939JYin, ZY; Jin, YF; Huang, HW; Shen, SLNaNNaNNaNYin Zhen-Yu; Jin Yin-Fu; Huang Hong-Wei; Shen ...NaNNaNEvolutionary polynomial regression based model...ENGINEERING GEOLOGY...Engineering, Geological; Geosciences, Multidis...Science Citation Index Expanded (SCI-EXPANDED)Engineering; GeologyDS2IGNaNNaNNaNNaN2023-04-28WOS:000380592100015
34651JWang, JH; Lindenbergh, R; Menenti, MNaNNaNNaNWang, Jinhu; Lindenbergh, Roderik; Menenti, Ma...NaNNaNSigVox - A 3D feature matching algorithm for a...ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SEN......Geography, Physical; Geosciences, Multidiscipl...Science Citation Index Expanded (SCI-EXPANDED)Physical Geography; Geology; Remote Sensing; I...EX2BVNaNNaNNaNNaN2023-04-28WOS:000403031400010
\n

100 rows × 71 columns

\n
" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n", "wos.sample(100)" ] }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of initial (valid interval) records: 56196\n" ] }, { "data": { "text/plain": " Domain_English Field_English \n0 Applied Sciences Agriculture, Fisheries & Forestry \\\n1 Applied Sciences Agriculture, Fisheries & Forestry \n2 Applied Sciences Agriculture, Fisheries & Forestry \n3 Applied Sciences Agriculture, Fisheries & Forestry \n4 Applied Sciences Agriculture, Fisheries & Forestry \n\n SubField_English 2.00 SEQ Source_title srcid \n0 Agronomy & Agriculture 1 Annals of Biology 13016 \\\n1 Agronomy & Agriculture 1 Advances in Agronomy 14324 \n2 Agronomy & Agriculture 1 European Journal of Soil Biology 14648 \n3 Agronomy & Agriculture 1 Soil Biology and Biochemistry 14802 \n4 Agronomy & Agriculture 1 Agricultura Tecnica 14972 \n\n issn_type issn \n0 issn1 09700153 \n1 issn1 00652113 \n2 issn1 11645563 \n3 issn1 00380717 \n4 issn1 03652807 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Domain_EnglishField_EnglishSubField_English2.00 SEQSource_titlesrcidissn_typeissn
0Applied SciencesAgriculture, Fisheries & ForestryAgronomy & Agriculture1Annals of Biology13016issn109700153
1Applied SciencesAgriculture, Fisheries & ForestryAgronomy & Agriculture1Advances in Agronomy14324issn100652113
2Applied SciencesAgriculture, Fisheries & ForestryAgronomy & Agriculture1European Journal of Soil Biology14648issn111645563
3Applied SciencesAgriculture, Fisheries & ForestryAgronomy & Agriculture1Soil Biology and Biochemistry14802issn100380717
4Applied SciencesAgriculture, Fisheries & ForestryAgronomy & Agriculture1Agricultura Tecnica14972issn103652807
\n
" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "wos = wos[((wos[\"Publication Year\"]<2023)&(wos[\"Publication Year\"]>2010))].copy()\n", "print(f'Number of initial (valid interval) records: {len(wos)}')\n", "\n", "metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n", "\n", "\n", "metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n", "metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n", "metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "metrix.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "data": { "text/plain": "Domain_English 6\nField_English 21\nSubField_English 175\ndtype: int64" }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metrix[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of initial (valid interval) records: 56196\n", "Number of METRIX filtered records: 49854\n", "Number of unindexed records: 2984\n", "Number of filtered records (dropping duplicates): 49839\n" ] } ], "source": [ "\n", "wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n", "wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n", "wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n", "\n", "wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n", "\n", "\n", "\n", "wos_indexed = wos_merge[~wos_merge[\"Domain_English\"].isna()]\n", "wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n", "\n", "\n", "wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n", "wos = wos_indexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n", "\n", "wos_postmerge = wos.copy()\n", "print(f'Number of METRIX filtered records: {len(wos)}')\n", "print(f'Number of unindexed records: {len(wos_unindexed)}')\n", "\n", "# drop entries not indexed by metrix\n", "# drop duplicates (based on doi)\n", "wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n", "wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n", "print(f'Number of filtered records (dropping duplicates): {len(wos)}')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "data": { "text/plain": "Domain_English\nApplied Sciences 31871\nNatural Sciences 9542\nHealth Sciences 5942\nEconomic & Social Sciences 1468\narticle-level classification 940\nArts & Humanities 76\nName: count, dtype: int64" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[\"Domain_English\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 6, "outputs": [], "source": [ "wos_classifier = wos[[\"WoS Categories\",\"Research Areas\"]+list(metrix.columns)].copy().drop_duplicates()\n", "wos_classifier = wos_classifier.groupby([\"WoS Categories\",\"Research Areas\"], as_index=False)[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].agg(\n", " lambda x: pd.Series.mode(x)[0])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 7, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found: 2065 \n", "Lost forever: 919\n" ] } ], "source": [ "wos_to_reindex = wos_unindexed.drop(columns=list(metrix.columns))\n", "wos_found = wos_to_reindex.merge(wos_classifier, on=[\"WoS Categories\",\"Research Areas\"], how=\"inner\")\n", "# wos_found = wos_to_reindex.merge(wos_classifier, on=\"Research Areas\", how=\"inner\")\n", "# # wos_found = wos_to_reindex.merge(wos_classifier, on=\"WoS Categories\", how=\"inner\")\n", "wos_stillost = wos_unindexed[~wos_unindexed[record_col].isin(wos_found[record_col])]\n", "\n", "print(\"Found:\", wos_found[record_col].nunique(),\"\\nLost forever:\", wos_stillost[record_col].nunique())" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of records (after remerge): 51904\n" ] } ], "source": [ "wos = pd.concat([wos,wos_found], ignore_index=True)\n", "print(f'Number of records (after remerge): {len(wos)}')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, "outputs": [ { "data": { "text/plain": "Domain_English\nApplied Sciences 33720\nNatural Sciences 9617\nHealth Sciences 6002\nEconomic & Social Sciences 1533\narticle-level classification 955\nArts & Humanities 77\nName: count, dtype: int64" }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[\"Domain_English\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "data": { "text/plain": "WoS Categories\nEngineering, Electrical & Electronic 13661\nComputer Science, Artificial Intelligence 7760\nComputer Science, Information Systems 6481\nTelecommunications 5560\nComputer Science, Theory & Methods 3597\n ... \nMusic 1\nCultural Studies 1\nPsychology, Psychoanalysis 1\nAsian Studies 1\nAndrology 1\nName: count, Length: 236, dtype: int64" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n", "wos_cat[\"WoS Categories\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "data": { "text/plain": "WoS Category\nEngineering 20126\nComputer Science 17613\nTelecommunications 5560\nImaging Science & Photographic Technology 3295\nAutomation & Control Systems 3232\n ... \nMusic 1\nAndrology 1\nLiterature 1\nCultural Studies 1\nAsian Studies 1\nName: count, Length: 177, dtype: int64" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_subcat = wos_cat.copy()\n", "wos_subcat[['WoS Category', 'WoS SubCategory']] = wos_subcat[\"WoS Categories\"].str.split(\",\", expand = True, n=1)\n", "for c in ['WoS Category', 'WoS SubCategory',\"WoS Categories\"]:\n", " wos_subcat[c] = wos_subcat[c].str.strip()\n", "wos_subcat.drop_duplicates(subset=[record_col,'WoS Category'])[\"WoS Category\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [ { "data": { "text/plain": "Research Areas\nEngineering 20176\nComputer Science 17613\nTelecommunications 5560\nEnvironmental Sciences & Ecology 3732\nImaging Science & Photographic Technology 3295\n ... \nLiterature 1\nWomen's Studies 1\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 147, dtype: int64" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n", "wos_areas[\"Research Areas\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 13, "outputs": [ { "data": { "text/plain": " Article Title \n24862 Kinematic self-calibration of non-contact five... \\\n6623 Optimizing Color Assignment for Perception of ... \n20728 CFD modeling of biomass combustion and gasific... \n41245 Redshift-space distortions in f(R) gravity \n12373 Executable Knowledge Graphs for Machine Learni... \n... ... \n11117 Biochar amendment mitigated N2O emissions from... \n47975 Adaptive Noise Reduction for Sound Event Detec... \n4599 NVM Storage in IoT Devices: Opportunities and ... \n40609 FABNet: Fusion Attention Block and Transfer Le... \n45199 Tea Category Identification Using a Novel Frac... \n\n Keywords Plus \n24862 POSE MEASUREMENT; PARALLEL; MANIPULATOR \\\n6623 OPTIMIZATION; DIFFERENCE \n20728 DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI... \n41245 DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT... \n12373 NaN \n... ... \n11117 NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA... \n47975 NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR... \n4599 ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO... \n40609 NUCLEI \n45199 LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI... \n\n Author Keywords \n24862 kinematic self-calibration; five-axis measurin... \n6623 Color perception; visual design; scatterplots \n20728 Biomass combustion and gasification; CFD simul... \n41245 cosmology: theory; dark energy; large-scale st... \n12373 Knowledge graph; Machine learning; Data analyt... \n... ... \n11117 Biochar; Nitrite accumulation; Nitrous oxide; ... \n47975 sound event detection; non-stationary noise; w... \n4599 IoT; NVM; storage system; energy efficiency; s... \n40609 Cancer; Analytical models; Transfer learning; ... \n45199 tea-category identification; fractional Fourie... \n\n[100 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Article TitleKeywords PlusAuthor Keywords
24862Kinematic self-calibration of non-contact five...POSE MEASUREMENT; PARALLEL; MANIPULATORkinematic self-calibration; five-axis measurin...
6623Optimizing Color Assignment for Perception of ...OPTIMIZATION; DIFFERENCEColor perception; visual design; scatterplots
20728CFD modeling of biomass combustion and gasific...DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...Biomass combustion and gasification; CFD simul...
41245Redshift-space distortions in f(R) gravityDARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...cosmology: theory; dark energy; large-scale st...
12373Executable Knowledge Graphs for Machine Learni...NaNKnowledge graph; Machine learning; Data analyt...
............
11117Biochar amendment mitigated N2O emissions from...NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...Biochar; Nitrite accumulation; Nitrous oxide; ...
47975Adaptive Noise Reduction for Sound Event Detec...NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...sound event detection; non-stationary noise; w...
4599NVM Storage in IoT Devices: Opportunities and ...ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...IoT; NVM; storage system; energy efficiency; s...
40609FABNet: Fusion Attention Block and Transfer Le...NUCLEICancer; Analytical models; Transfer learning; ...
45199Tea Category Identification Using a Novel Frac...LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...tea-category identification; fractional Fourie...
\n

100 rows × 3 columns

\n
" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION\n1 WOS:000208837000001 HARDNESS\n2 WOS:000208837000001 PLASMA-SPRAYED COATING\n3 WOS:000208837000001 INVERSE ANALYSIS\n4 WOS:000208837000001 NUMERICAL METHOD\n.. ... ...\n97 WOS:000209571700012 PERSONALIZED MEDICINE\n98 WOS:000209571700012 COMPLEX NETWORK\n99 WOS:000209571700012 CLINICAL PHENOTYPE NETWORK\n100 WOS:000209571700012 TRADITIONAL CHINESE MEDICINE\n101 WOS:000209617200002 PHYLLOSCOPIDAE\n\n[100 rows x 2 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)keyword_all
0WOS:000208837000001NANOINDENTATION
1WOS:000208837000001HARDNESS
2WOS:000208837000001PLASMA-SPRAYED COATING
3WOS:000208837000001INVERSE ANALYSIS
4WOS:000208837000001NUMERICAL METHOD
.........
97WOS:000209571700012PERSONALIZED MEDICINE
98WOS:000209571700012COMPLEX NETWORK
99WOS:000209571700012CLINICAL PHENOTYPE NETWORK
100WOS:000209571700012TRADITIONAL CHINESE MEDICINE
101WOS:000209617200002PHYLLOSCOPIDAE
\n

100 rows × 2 columns

\n
" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kw_df = pd.DataFrame()\n", "for c in [\"Keywords Plus\",\"Author Keywords\"]:\n", " kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n", " kwp.name = 'keyword_all'\n", " kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n", "kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n", "kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n", "kw_df.head(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208837000001 NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...\n1 WOS:000208863600013 COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n2 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n3 WOS:000208863900217 DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n4 WOS:000208935500007 ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)keyword_all
0WOS:000208837000001NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...
1WOS:000208863600013COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...
2WOS:000208863600266ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...
3WOS:000208863900217DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...
4WOS:000208935500007ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...
\n
" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n", "wos_kwd_concat.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 15, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 16, "outputs": [ { "data": { "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')" }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "geotext = GeoText()\n", "\n", "def extract_location(input_text, key='countries'):\n", " anomalies = {\"Malta\":\"Malta\",\n", " \"Mongolia\":\"Mongolia\",\n", " \"Quatar\":\"Qatar\",\n", " \"Qatar\":\"Qatar\",\n", " \"Ethiop\":\"Ethiopia\",\n", " \"Nigeria\":\"Nigeria\",\n", " \"BELAR\":\"Belarus\",\n", " \"Venezuela\":\"Venezuela\",\n", " \"Cyprus\":\"Cyprus\",\n", " \"Ecuador\":\"Ecuador\",\n", " \"U Arab\":\"United Arab Emirates\",\n", " \"Syria\":\"Syria\",\n", " \"Uganda\":\"Uganda\",\n", " \"Yemen\":\"Yemen\",\n", " \"Mali\":\"Mali\",\n", " \"Senegal\":\"Senegal\",\n", " \"Vatican\":\"Vatican\",\n", " \"Uruguay\":\"Uruguay\",\n", " \"Panama\":\"Panama\",\n", " \"Fiji\":\"Fiji\",\n", " \"Faroe\":\"Faroe Islands\",\n", " \"Macedonia\":\"Macedonia\",\n", " 'Mozambique':'Mozambique',\n", " \"Kuwait\":\"Kuwait\",\n", " \"Libya\":\"Libya\",\n", " \"Turkiy\":\"Turkey\",\n", " \"Liberia\":\"Liberia\",\n", " \"Namibia\":\"Namibia\",\n", " \"Ivoire\":\"Ivory Coast\",\n", " \"Guatemala\":\"Gutemala\",\n", " \"Paraguay\":\"Paraguay\",\n", " \"Honduras\":\"Honduras\",\n", " \"Nicaragua\":\"Nicaragua\",\n", " \"Trinidad\":\"Trinidad & Tobago\",\n", " \"Liechtenstein\":\"Liechtenstein\",\n", " \"Greenland\":\"Denmark\"}\n", "\n", " extracted = geotext.extract(input_text=input_text)\n", " found = extracted[key].keys()\n", " if len(sorted(found))>0:\n", " return sorted(found)[0]\n", " elif key=='countries':\n", " for i in ['Scotland','Wales','England', 'N Ireland']:\n", " if i in input_text:\n", " return 'United Kingdom'\n", " for j in anomalies.keys():\n", " if j in input_text:\n", " return anomalies.get(j)\n", " else:\n", " return None\n", "\n", "with open('../eu_members.txt',\"r\") as f:\n", " eu_countries=f.readline().split(\",\")\n", " eu_countries=[i.strip() for i in eu_countries]\n", "\n", "def country_cleanup(country):\n", " if \"USA\" in country:\n", " return \"USA\"\n", " elif \"China\" in country:\n", " return \"China\"\n", " elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n", " return \"United Kingdom\"\n", " else:\n", " return country\n", "\n", "\n", "def country_type(country):\n", " if country in eu_countries:\n", " return \"EU\"\n", " elif country==\"China\":\n", " return \"China\"\n", " elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n", " return \"Non-EU associate\"\n", " else:\n", " return \"Other\"\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n", "\n", "\n", "locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n", "locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n", "locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])" ] }, { "cell_type": "code", "execution_count": 19, "outputs": [ { "data": { "text/plain": "312820" }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(locations)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 20, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Authors_of_address \n0 WOS:000208837000001 Gitzhofer, Francois \\\n1 WOS:000208837000001 Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph... \n2 WOS:000208837000001 Guo, Wei-Chao; Zhang, Wei-Hong \n3 WOS:000208837000001 Rauchs, Gast \n4 WOS:000208863600013 Hu, Baolan \n.. ... ... \n95 WOS:000209546000001 Salahuddin, Nawal \n96 WOS:000209546000001 Shrestha, Babu Raja \n97 WOS:000209546000001 Tan, Cheng Cheng \n98 WOS:000209546000001 Tang, Yao-Qing \n99 WOS:000209546000001 Tu, Mei-Lien \n\n Address \n0 Univ Sherbrooke, Dept Chem Engn, Plasma Techno... \n1 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \n2 Northwestern Polytech Univ, Key Lab Contempora... \n3 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n.. ... \n95 Aga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca... \n96 Kathmandu Med Coll Teaching Hosp, Dept Anesthe... \n97 Sultanah Aminah Hosp, Dept Anaesthesia & Inten... \n98 Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,... \n99 Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R... \n\n[100 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)Authors_of_addressAddress
0WOS:000208837000001Gitzhofer, FrancoisUniv Sherbrooke, Dept Chem Engn, Plasma Techno...
1WOS:000208837000001Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...
2WOS:000208837000001Guo, Wei-Chao; Zhang, Wei-HongNorthwestern Polytech Univ, Key Lab Contempora...
3WOS:000208837000001Rauchs, GastCtr Rech Publ Henri Tudor, Dept Adv Mat & Stru...
4WOS:000208863600013Hu, BaolanZhejiang Univ, Dept Environm Engn, Hangzhou 31...
............
95WOS:000209546000001Salahuddin, NawalAga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca...
96WOS:000209546000001Shrestha, Babu RajaKathmandu Med Coll Teaching Hosp, Dept Anesthe...
97WOS:000209546000001Tan, Cheng ChengSultanah Aminah Hosp, Dept Anaesthesia & Inten...
98WOS:000209546000001Tang, Yao-QingShanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...
99WOS:000209546000001Tu, Mei-LienChang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R...
\n

100 rows × 3 columns

\n
" }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n", "locations = locations.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_2\")\n", "locations.head(100)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 21, "outputs": [], "source": [ "# import dask.dataframe as dd\n", "#\n", "# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame\n", "# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 22, "outputs": [], "source": [ "# locations_test = locations.head(1000)\n", "# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n", "# locations_test" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 23, "outputs": [], "source": [ "\n", "# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n", "locations[\"Country\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n", "locations[\"Country\"]=locations['Country'].apply(lambda x: country_cleanup(x))\n", "locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n", "locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 23, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 24, "outputs": [], "source": [ "scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n", "locations=locations[locations[\"Country_Type\"].isin(scope_types)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208837000001 Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L... \\\n2 WOS:000208837000001 Northwestern Polytech Univ, Key Lab Contempora... \n3 WOS:000208837000001 Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru... \n4 WOS:000208863600013 Zhejiang Univ, Dept Environm Engn, Hangzhou 31... \n5 WOS:000208863600013 Delft Univ Technol, Dept Biotechnol, Delft, Ne... \n\n Country City Country_Type Institution \n1 Belgium Liège EU Univ Liege \n2 China Xi’an China Northwestern Polytech Univ \n3 Luxembourg Luxembourg EU Ctr Rech Publ Henri Tudor \n4 China Hangzhou China Zhejiang Univ \n5 Netherlands Delft EU Delft Univ Technol ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)AddressCountryCityCountry_TypeInstitution
1WOS:000208837000001Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...BelgiumLiègeEUUniv Liege
2WOS:000208837000001Northwestern Polytech Univ, Key Lab Contempora...ChinaXi’anChinaNorthwestern Polytech Univ
3WOS:000208837000001Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru...LuxembourgLuxembourgEUCtr Rech Publ Henri Tudor
4WOS:000208863600013Zhejiang Univ, Dept Environm Engn, Hangzhou 31...ChinaHangzhouChinaZhejiang Univ
5WOS:000208863600013Delft Univ Technol, Dept Biotechnol, Delft, Ne...NetherlandsDelftEUDelft Univ Technol
\n
" }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n", "univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n", "univ_locations = univ_locations.drop_duplicates()\n", "univ_locations.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type author_str_id\n0 WOS:000208837000001 Belgium EU 6079964a4094c607358a130e41e89f90\n1 WOS:000208837000001 Belgium EU 2321037fa90ac94a23b88a79f1c7f454\n2 WOS:000208837000001 Belgium EU 8a1bfa1e7bc52d323f0d9c23a9b74ed3\n3 WOS:000208837000001 China China 6079964a4094c607358a130e41e89f90\n4 WOS:000208837000001 China China 17fb036de6a4db3ba39ccab3d8307c04", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)CountryCountry_Typeauthor_str_id
0WOS:000208837000001BelgiumEU6079964a4094c607358a130e41e89f90
1WOS:000208837000001BelgiumEU2321037fa90ac94a23b88a79f1c7f454
2WOS:000208837000001BelgiumEU8a1bfa1e7bc52d323f0d9c23a9b74ed3
3WOS:000208837000001ChinaChina6079964a4094c607358a130e41e89f90
4WOS:000208837000001ChinaChina17fb036de6a4db3ba39ccab3d8307c04
\n
" }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n", "author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n", "author_locations = author_locations.drop(columns=\"Authors_of_address\")\n", "author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n", "author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n", "author_locations = author_locations.drop(columns=\"Author_name\")\n", "author_locations.head()" ] }, { "cell_type": "code", "execution_count": 27, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208837000001 Belgium EU \\\n3 WOS:000208837000001 China China \n4 WOS:000208837000001 China China \n6 WOS:000208863600013 China China \n7 WOS:000208863600013 Netherlands EU \n... ... ... ... \n643323 WOS:000964683900016 Italy EU \n643324 WOS:000964683900016 Italy EU \n643325 WOS:000967389100001 China China \n643326 WOS:000967389100001 Norway Non-EU associate \n643327 WOS:000967389100001 Norway Non-EU associate \n\n author_str_id \n0 6079964a4094c607358a130e41e89f90 \n3 6079964a4094c607358a130e41e89f90 \n4 17fb036de6a4db3ba39ccab3d8307c04 \n6 54c7bc6fe9b77434ca1bf04d763d843b \n7 df81f9da6c8f5c968c16ef0aab1bb8f9 \n... ... \n643323 3c631398a81ab7058d95a0c6418a2c0b \n643324 3c631398a81ab7058d95a0c6418a2c0b \n643325 ce65541a6c334225a9617439f4a95012 \n643326 7c52a53f8d79b1ffd4f2e4cde9548e1d \n643327 7c52a53f8d79b1ffd4f2e4cde9548e1d \n\n[573569 rows x 4 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)CountryCountry_Typeauthor_str_id
0WOS:000208837000001BelgiumEU6079964a4094c607358a130e41e89f90
3WOS:000208837000001ChinaChina6079964a4094c607358a130e41e89f90
4WOS:000208837000001ChinaChina17fb036de6a4db3ba39ccab3d8307c04
6WOS:000208863600013ChinaChina54c7bc6fe9b77434ca1bf04d763d843b
7WOS:000208863600013NetherlandsEUdf81f9da6c8f5c968c16ef0aab1bb8f9
...............
643323WOS:000964683900016ItalyEU3c631398a81ab7058d95a0c6418a2c0b
643324WOS:000964683900016ItalyEU3c631398a81ab7058d95a0c6418a2c0b
643325WOS:000967389100001ChinaChinace65541a6c334225a9617439f4a95012
643326WOS:000967389100001NorwayNon-EU associate7c52a53f8d79b1ffd4f2e4cde9548e1d
643327WOS:000967389100001NorwayNon-EU associate7c52a53f8d79b1ffd4f2e4cde9548e1d
\n

573569 rows × 4 columns

\n
" }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_locations[author_locations['author_str_id'].duplicated(False)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n", "# author_primary_region\n", "\n", "china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n", "eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n", "assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n", "\n", "\n", "# records that have distinct authors with different country affiliations\n", "valid_scope = wos[((wos[record_col].isin(china))\n", " &\n", " ((wos[record_col].isin(eu))\n", " |\n", " (wos[record_col].isin(assoc))))][record_col].unique()" ] }, { "cell_type": "code", "execution_count": 29, "outputs": [ { "data": { "text/plain": " UT (Unique WOS ID) Country Country_Type \n537692 WOS:000732204600001 China China \\\n204027 WOS:000414089800001 China China \n204028 WOS:000414089800001 China China \n204029 WOS:000414089800001 China China \n204030 WOS:000414090800001 China China \n\n author_str_id \n537692 8fe31cbbd07c639aa4d779688896be81 \n204027 67c7beb18fafd77f1319739fa683bc5e \n204028 7269f0a31fc620688aae12aad9e3cd85 \n204029 ac28aea698a527fb5195d3d24189ea04 \n204030 6c91bf481b6bddc1426d12a18823224a ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
UT (Unique WOS ID)CountryCountry_Typeauthor_str_id
537692WOS:000732204600001ChinaChina8fe31cbbd07c639aa4d779688896be81
204027WOS:000414089800001ChinaChina67c7beb18fafd77f1319739fa683bc5e
204028WOS:000414089800001ChinaChina7269f0a31fc620688aae12aad9e3cd85
204029WOS:000414089800001ChinaChinaac28aea698a527fb5195d3d24189ea04
204030WOS:000414090800001ChinaChina6c91bf481b6bddc1426d12a18823224a
\n
" }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "author_primary_region.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of records: 51904\n", "Number of valid cooperation records: 46060\n" ] } ], "source": [ "print(f'Number of records: {len(wos)}')\n", "print(f'Number of valid cooperation records: {len(valid_scope)}')" ] }, { "cell_type": "code", "execution_count": 31, "outputs": [], "source": [ "wos = wos[wos[record_col].isin(valid_scope)]\n", "locations = locations[locations[record_col].isin(valid_scope)]\n", "univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n", "author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n", "author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n", "affiliations = affiliations.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 33, "outputs": [ { "data": { "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 5616\nUNIVERSITY OF LONDON 2604\nUDICE-FRENCH RESEARCH UNIVERSITIES 2240\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 2170\nTSINGHUA UNIVERSITY 1935\n ... \nUNIVERSITY OF FUKUI 1\nPONTIFICIA UNIVERSIDADE CATOLICA DE GOIAS 1\nINSTITUTE OF ORGANIC CHEMISTRY & BIOCHEMISTRY OF THE CZECH ACADEMY OF SCIENCES 1\nUNIVERSITAS PELITA HARAPAN 1\nFRANCISCUS GASTHUIS 1\nName: count, Length: 7609, dtype: int64" }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 34, "outputs": [ { "data": { "text/plain": "Institution\nChinese Acad Sci 5749\nTsinghua Univ 2315\nShanghai Jiao Tong Univ 1976\nZhejiang Univ 1806\nPeking Univ 1661\n ... \nNatl Technol Inst Mental Disorders 1\nSeinajoki Univ Appl Sci 1\nJD Intelligent City Res 1\nCAS Ctr Excellence Planetol 1\nKey Lab Intelligent Prevent Med Zhejiang Prov 1\nName: count, Length: 19821, dtype: int64" }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 35, "outputs": [ { "data": { "text/plain": "46060" }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 36, "outputs": [ { "data": { "text/plain": "46060" }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[record_col].nunique()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 37, "outputs": [ { "data": { "text/plain": "202790" }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "univ_locations[\"Institution\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 38, "outputs": [ { "data": { "text/plain": "268471" }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "affiliations[\"Affiliations\"].value_counts().sum()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": "WoS Categories\n Engineering, Electrical & Electronic 8303\nComputer Science, Artificial Intelligence 6115\n Telecommunications 4661\nComputer Science, Information Systems 4584\nEngineering, Electrical & Electronic 4036\n ... \nCultural Studies 1\n Ornithology 1\n Criminology & Penology 1\nArt 1\n Psychology, Developmental 1\nName: count, Length: 425, dtype: int64" }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_cat[\"WoS Categories\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": "Research Areas\nEngineering 18098\nComputer Science 15658\nTelecommunications 5046\nEnvironmental Sciences & Ecology 3246\nImaging Science & Photographic Technology 2947\n ... \nFilm, Radio & Television 2\nArea Studies 2\nCultural Studies 1\nAsian Studies 1\nMusic 1\nName: count, Length: 145, dtype: int64" }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n", "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n", "wos_areas[\"Research Areas\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": "['Domain_English', 'Field_English', 'SubField_English']" }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[c for c in wos.columns if \"_English\" in c]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "metrix_levels = [c for c in wos.columns if \"_English\" in c]\n", "for m in metrix_levels:\n", " wos[m] = wos[m].replace({\"article-level classification\":\"Multidisciplinary\"})\n" ] }, { "cell_type": "code", "execution_count": 42, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": " Publication Type Authors \n0 J Yan, Z; Jing, XY; Pedrycz, W \\\n1 J Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ... \n2 J Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue... \n3 J Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ... \n4 J Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu... \n... ... ... \n51897 J Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ... \n51898 J Wang, HC; Roussel, P; Denby, B \n51899 J Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S \n51902 J Chu, WP; Song, Y \n51903 J Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;... \n\n Book Authors Book Editors Book Group Authors \n0 NaN NaN NaN \\\n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n... ... ... ... \n51897 NaN NaN NaN \n51898 NaN NaN NaN \n51899 NaN NaN NaN \n51902 NaN NaN NaN \n51903 NaN NaN NaN \n\n Author Full Names \n0 Yan, Zheng; Jing, Xuyang; Pedrycz, Witold \\\n1 Sookhak, Mehdi; Yu, F. Richard; He, Ying; Tale... \n2 Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G... \n3 Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum... \n4 Lu, Tianguang; Chen, Xinyu; McElroy, Michael B... \n... ... \n51897 Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir... \n51898 Wang, Hongcui; Roussel, Pierre; Denby, Bruce \n51899 Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh... \n51902 Chu, Wenping; Song, Yang \n51903 Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa... \n\n Book Author Full Names Group Authors \n0 NaN NaN \\\n1 NaN NaN \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n Article Title \n0 LEFusing and mining opinions for reputation ge... \\\n1 FOG VEHICULAR COMPUTING Augmentation of Fog Co... \n2 Deep Reinforcement Learning for Intelligent In... \n3 An Intelligent UAV based Data Aggregation Algo... \n4 A Reinforcement Learning-Based Decision System... \n... ... \n51897 Neural modal ordinary differential equations: ... \n51898 Improving ultrasound-based multimodal speech r... \n51899 Application of computer-aided image reconstruc... \n51902 Study on Dynamic Interaction of Railway Pantog... \n51903 A Review of Technical Standards for Smart Cities \n\n Source Title ... \n0 INFORMATION FUSION ... \\\n1 IEEE VEHICULAR TECHNOLOGY MAGAZINE ... \n2 IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ... ... \n3 COMPUTER NETWORKS ... \n4 IEEE TRANSACTIONS ON SMART GRID ... \n... ... ... \n51897 DATA-CENTRIC ENGINEERING ... \n51898 JASA EXPRESS LETTERS ... \n51899 EGYPTIAN JOURNAL OF NEUROSURGERY ... \n51902 VIBRATION ... \n51903 CLEAN TECHNOLOGIES ... \n\n UT (Unique WOS ID) issn_var issn Domain_English \n0 WOS:000394070100013 issn 15662535 Applied Sciences \\\n1 WOS:000408568800008 issn 15566072 Applied Sciences \n2 WOS:000502789700018 issn 23327731 Applied Sciences \n3 WOS:000626758800004 issn 13891286 Applied Sciences \n4 WOS:000641976000028 issn 19493053 Applied Sciences \n... ... ... ... ... \n51897 WOS:000906995300001 eissn NaN Applied Sciences \n51898 WOS:000642230800005 eissn NaN Natural Sciences \n51899 WOS:000807222600001 eissn NaN Health Sciences \n51902 WOS:000661660800001 eissn NaN Applied Sciences \n51903 WOS:000708219500008 eissn NaN Natural Sciences \n\n Field_English \n0 Information & Communication Technologies \\\n1 Information & Communication Technologies \n2 Information & Communication Technologies \n3 Information & Communication Technologies \n4 Enabling & Strategic Technologies \n... ... \n51897 Information & Communication Technologies \n51898 Physics & Astronomy \n51899 Clinical Medicine \n51902 Engineering \n51903 Earth & Environmental Sciences \n\n SubField_English 2.00 SEQ \n0 Artificial Intelligence & Image Processing 31 \\\n1 Networking & Telecommunications 37 \n2 Networking & Telecommunications 37 \n3 Networking & Telecommunications 37 \n4 Energy 14 \n... ... ... \n51897 Artificial Intelligence & Image Processing NaN \n51898 Acoustics NaN \n51899 Neurology & Neurosurgery NaN \n51902 Mechanical Engineering & Transports NaN \n51903 Environmental Sciences NaN \n\n Source_title srcid \n0 Information Fusion 2.609900e+04 \\\n1 IEEE Vehicular Technology Magazine 5.200153e+09 \n2 IEEE Transactions on Cognitive Communications ... 2.110085e+10 \n3 Computer Networks 2.681100e+04 \n4 IEEE Transactions on Smart Grid 1.970017e+10 \n... ... ... \n51897 NaN NaN \n51898 NaN NaN \n51899 NaN NaN \n51902 NaN NaN \n51903 NaN NaN \n\n issn_type \n0 issn1 \n1 issn1 \n2 issn1 \n3 issn1 \n4 issn2 \n... ... \n51897 NaN \n51898 NaN \n51899 NaN \n51902 NaN \n51903 NaN \n\n[46060 rows x 80 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Publication TypeAuthorsBook AuthorsBook EditorsBook Group AuthorsAuthor Full NamesBook Author Full NamesGroup AuthorsArticle TitleSource Title...UT (Unique WOS ID)issn_varissnDomain_EnglishField_EnglishSubField_English2.00 SEQSource_titlesrcidissn_type
0JYan, Z; Jing, XY; Pedrycz, WNaNNaNNaNYan, Zheng; Jing, Xuyang; Pedrycz, WitoldNaNNaNLEFusing and mining opinions for reputation ge...INFORMATION FUSION...WOS:000394070100013issn15662535Applied SciencesInformation & Communication TechnologiesArtificial Intelligence & Image Processing31Information Fusion2.609900e+04issn1
1JSookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...NaNNaNNaNSookhak, Mehdi; Yu, F. Richard; He, Ying; Tale...NaNNaNFOG VEHICULAR COMPUTING Augmentation of Fog Co...IEEE VEHICULAR TECHNOLOGY MAGAZINE...WOS:000408568800008issn15566072Applied SciencesInformation & Communication TechnologiesNetworking & Telecommunications37IEEE Vehicular Technology Magazine5.200153e+09issn1
2JNing, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...NaNNaNNaNNing, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...NaNNaNDeep Reinforcement Learning for Intelligent In...IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ......WOS:000502789700018issn23327731Applied SciencesInformation & Communication TechnologiesNetworking & Telecommunications37IEEE Transactions on Cognitive Communications ...2.110085e+10issn1
3JWang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...NaNNaNNaNWang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...NaNNaNAn Intelligent UAV based Data Aggregation Algo...COMPUTER NETWORKS...WOS:000626758800004issn13891286Applied SciencesInformation & Communication TechnologiesNetworking & Telecommunications37Computer Networks2.681100e+04issn1
4JLu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...NaNNaNNaNLu, Tianguang; Chen, Xinyu; McElroy, Michael B...NaNNaNA Reinforcement Learning-Based Decision System...IEEE TRANSACTIONS ON SMART GRID...WOS:000641976000028issn19493053Applied SciencesEnabling & Strategic TechnologiesEnergy14IEEE Transactions on Smart Grid1.970017e+10issn2
..................................................................
51897JLai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...NaNNaNNaNLai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...NaNNaNNeural modal ordinary differential equations: ...DATA-CENTRIC ENGINEERING...WOS:000906995300001eissnNaNApplied SciencesInformation & Communication TechnologiesArtificial Intelligence & Image ProcessingNaNNaNNaNNaN
51898JWang, HC; Roussel, P; Denby, BNaNNaNNaNWang, Hongcui; Roussel, Pierre; Denby, BruceNaNNaNImproving ultrasound-based multimodal speech r...JASA EXPRESS LETTERS...WOS:000642230800005eissnNaNNatural SciencesPhysics & AstronomyAcousticsNaNNaNNaNNaN
51899JZhang, R; Alpdogan, S; Kong, SQ; Muhammad, SNaNNaNNaNZhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...NaNNaNApplication of computer-aided image reconstruc...EGYPTIAN JOURNAL OF NEUROSURGERY...WOS:000807222600001eissnNaNHealth SciencesClinical MedicineNeurology & NeurosurgeryNaNNaNNaNNaN
51902JChu, WP; Song, YNaNNaNNaNChu, Wenping; Song, YangNaNNaNStudy on Dynamic Interaction of Railway Pantog...VIBRATION...WOS:000661660800001eissnNaNApplied SciencesEngineeringMechanical Engineering & TransportsNaNNaNNaNNaN
51903JLai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;...NaNNaNNaNLai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa...NaNNaNA Review of Technical Standards for Smart CitiesCLEAN TECHNOLOGIES...WOS:000708219500008eissnNaNNatural SciencesEarth & Environmental SciencesEnvironmental SciencesNaNNaNNaNNaN
\n

46060 rows × 80 columns

\n
" }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": "['Domain_English', 'Field_English', 'SubField_English']" }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metrix_levels" ] }, { "cell_type": "code", "execution_count": 45, "outputs": [], "source": [ "record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n", "record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n", "record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n", "country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 46, "outputs": [], "source": [ "# Basic network layout" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 47, "outputs": [], "source": [ "country_collabs = record_countries.merge(record_countries, on=record_col)\n", "country_collabs = country_collabs[country_collabs[\"Country_x\"]!=country_collabs[\"Country_y\"]]\n", "country_collabs[\"weight\"] = 0.5" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 48, "outputs": [], "source": [ "inst_collabs = record_institution.merge(record_institution, on=record_col)\n", "inst_collabs = inst_collabs[inst_collabs[\"Institution_x\"]!=inst_collabs[\"Institution_y\"]]\n", "inst_collabs[\"weight\"] = 0.5" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 49, "outputs": [ { "data": { "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')" }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wos.columns" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 50, "outputs": [ { "data": { "text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']" }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drop_cols = [ws for ws in wos.columns if ((\"uthor\" in ws or \"ddress\" in ws or \"ORCID\" in\n", " ws or \"esearcher\" in ws or \"ditor\" in ws or \"name\" in ws or 'SEQ' in ws) and \"eyword\" not in ws)]\n", "drop_cols" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 51, "outputs": [], "source": [ "outdir=\"wos_processed_data\"" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 52, "outputs": [], "source": [ "os.makedirs(outdir, exist_ok=True)\n", "\n", "wos.drop(columns=drop_cols).to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n", "\n", "record_countries.to_excel(f\"{outdir}/wos_countries.xlsx\", index=False)\n", "\n", "record_author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n", "\n", "record_institution.to_excel(f\"{outdir}/wos_institution_locations.xlsx\", index=False)\n", "\n", "kw_df.to_excel(f\"{outdir}/wos_keywords.xlsx\", index=False)\n", "\n", "country_types.to_excel(f\"{outdir}/wos_country_types.xlsx\", index=False)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 53, "outputs": [], "source": [ "wos.drop(columns=drop_cols).to_csv(f\"{outdir}/wos_processed.csv\", index=False, sep='\\t')\n", "\n", "record_countries.to_csv(f\"{outdir}/wos_countries.csv\", index=False, sep='\\t')\n", "\n", "record_author_locations.to_csv(f\"{outdir}/wos_author_locations.csv\", index=False, sep='\\t')\n", "\n", "record_institution.to_csv(f\"{outdir}/wos_institution_locations.csv\", index=False, sep='\\t')\n", "\n", "kw_df.to_csv(f\"{outdir}/wos_keywords.csv\", index=False, sep='\\t')\n", "\n", "country_types.to_csv(f\"{outdir}/wos_country_types.csv\", index=False, sep='\\t')\n", "\n", "inst_collabs.to_csv(f\"{outdir}/wos_inst_collabs.csv\", index=False, sep='\\t')\n", "\n", "country_collabs.to_csv(f\"{outdir}/wos_country_collabs.csv\", index=False, sep='\\t')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 54, "outputs": [], "source": [ "wos_areas.to_csv(f\"{outdir}/wos_research_areas.csv\", index=False, sep='\\t')\n", "\n", "wos_subcat.to_csv(f\"{outdir}/wos_categories.csv\", index=False, sep='\\t')" ], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }