In [1]:
import numpy as np
import pandas as pd
import os
import shutil
from flashgeotext.geotext import GeoText
import re

In [2]:
import hashlib

def md5hash(s: str):
    return hashlib.md5(s.encode('utf-8')).hexdigest()

In [3]:
record_col="UT (Unique WOS ID)"
outfile = r"C:\Users\radvanyi\PycharmProjects\ZSI_analytics\WOS\wos_extract\wos_records_concat.csv"

In [4]:
wos = pd.read_csv(outfile, sep="\t",low_memory=False)

wos = wos[((wos["Publication Year"]<2023)&(wos["Publication Year"]>2010))].copy()
print(f'Number of initial (valid interval) records: {len(wos)}')

metrix = pd.read_excel("sm_journal_classification.xlsx", sheet_name="Journal_Classification")


metrix = metrix.set_index([c for c in metrix.columns if "issn" not in c]).stack().reset_index()
metrix = metrix.rename(columns={'level_6':"issn_type", 0:"issn"})
metrix["issn"]=metrix["issn"].str.replace("-","").str.lower().str.strip()

wos["issn"] = wos["ISSN"].str.replace("-","").str.lower().str.strip()
wos["eissn"] = wos["eISSN"].str.replace("-","").str.lower().str.strip()
wos = wos.set_index([c for c in wos.columns if "issn" not in c]).stack().reset_index()
wos = wos.rename(columns={'level_71':"issn_var", 0:"issn"})

wos_merge = wos.merge(metrix, on="issn", how="left")



wos_indexed = wos_merge[~wos_merge["Domain_English"].isna()]
wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]


wos_unindexed = wos_unindexed.sort_values(by=["issn_var"],ascending=False).drop_duplicates(subset=record_col)
wos = wos_indexed.sort_values(by=["issn_var"],ascending=False).drop_duplicates(subset=record_col)

wos_postmerge = wos.copy()
print(f'Number of METRIX filtered records: {len(wos)}')
print(f'Number of unindexed records: {len(wos_unindexed)}')

# drop entries not indexed by metrix
# drop duplicates (based on doi)
wos = wos[~((~wos["DOI"].isna())&(wos["DOI"].duplicated(False)))]
wos = wos.drop_duplicates(subset=["Publication Type","Document Type","Authors","Article Title","Source Title","Publication Year"])
print(f'Number of filtered records (dropping duplicates): {len(wos)}')

Number of initial (valid interval) records: 56196
Number of METRIX filtered records: 49854
Number of unindexed records: 2984
Number of filtered records (dropping duplicates): 49839


In [5]:
wos["Domain_English"].value_counts()

Domain_English
Applied Sciences                31871
Natural Sciences                 9542
Health Sciences                  5942
Economic & Social Sciences       1468
article-level classification      940
Arts & Humanities                  76
Name: count, dtype: int64

In [6]:
wos_classifier = wos[["WoS Categories","Research Areas"]+list(metrix.columns)].copy().drop_duplicates()
wos_classifier = wos_classifier.groupby(["WoS Categories","Research Areas"], as_index=False)[["Domain_English","Field_English","SubField_English"]].agg(
    lambda x: pd.Series.mode(x)[0])

In [7]:
wos_to_reindex = wos_unindexed.drop(columns=list(metrix.columns))
wos_found = wos_to_reindex.merge(wos_classifier, on=["WoS Categories","Research Areas"], how="inner")
# wos_found = wos_to_reindex.merge(wos_classifier, on="Research Areas", how="inner")
# # wos_found = wos_to_reindex.merge(wos_classifier, on="WoS Categories", how="inner")
wos_stillost = wos_unindexed[~wos_unindexed[record_col].isin(wos_found[record_col])]

print("Found:", wos_found[record_col].nunique(),"\nLost forever:", wos_stillost[record_col].nunique())

Found: 2065 
Lost forever: 919


In [8]:
wos = pd.concat([wos,wos_found], ignore_index=True)
print(f'Number of records (after remerge): {len(wos)}')

Number of records (after remerge): 51904


In [9]:
wos["Domain_English"].value_counts()

Domain_English
Applied Sciences                33720
Natural Sciences                 9617
Health Sciences                  6002
Economic & Social Sciences       1533
article-level classification      955
Arts & Humanities                  77
Name: count, dtype: int64

In [10]:
wos_cat = wos.groupby(record_col)["WoS Categories"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_1")
wos_cat["WoS Categories"] = wos_cat["WoS Categories"].str.strip()
wos_cat["WoS Categories"].value_counts()

WoS Categories
Engineering, Electrical & Electronic         13661
Computer Science, Artificial Intelligence     7760
Computer Science, Information Systems         6481
Telecommunications                            5560
Computer Science, Theory & Methods            3597
                                             ...  
Music                                            1
Cultural Studies                                 1
Psychology, Psychoanalysis                       1
Asian Studies                                    1
Andrology                                        1
Name: count, Length: 236, dtype: int64

In [11]:
wos_subcat = wos_cat.copy()
wos_subcat[['WoS Category', 'WoS SubCategory']] = wos_subcat["WoS Categories"].str.split(",", expand = True, n=1)
for c in ['WoS Category', 'WoS SubCategory',"WoS Categories"]:
    wos_subcat[c] = wos_subcat[c].str.strip()
wos_subcat.drop_duplicates(subset=[record_col,'WoS Category'])["WoS Category"].value_counts()

WoS Category
Engineering                                  20126
Computer Science                             17613
Telecommunications                            5560
Imaging Science & Photographic Technology     3295
Automation & Control Systems                  3232
                                             ...  
Music                                            1
Andrology                                        1
Literature                                       1
Cultural Studies                                 1
Asian Studies                                    1
Name: count, Length: 177, dtype: int64

In [12]:
wos_areas = wos.groupby(record_col)["Research Areas"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_1")
wos_areas["Research Areas"] = wos_areas["Research Areas"].str.strip()
wos_areas["Research Areas"].value_counts()

Research Areas
Engineering                                  20176
Computer Science                             17613
Telecommunications                            5560
Environmental Sciences & Ecology              3732
Imaging Science & Photographic Technology     3295
                                             ...  
Literature                                       1
Women's Studies                                  1
Cultural Studies                                 1
Asian Studies                                    1
Music                                            1
Name: count, Length: 147, dtype: int64

In [13]:
wos[["Article Title","Keywords Plus","Author Keywords"]].sample(100)

Unnamed: 0,Article Title,Keywords Plus,Author Keywords
24862,Kinematic self-calibration of non-contact five...,POSE MEASUREMENT; PARALLEL; MANIPULATOR,kinematic self-calibration; five-axis measurin...
6623,Optimizing Color Assignment for Perception of ...,OPTIMIZATION; DIFFERENCE,Color perception; visual design; scatterplots
20728,CFD modeling of biomass combustion and gasific...,DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...,Biomass combustion and gasification; CFD simul...
41245,Redshift-space distortions in f(R) gravity,DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...,cosmology: theory; dark energy; large-scale st...
12373,Executable Knowledge Graphs for Machine Learni...,,Knowledge graph; Machine learning; Data analyt...
...,...,...,...
11117,Biochar amendment mitigated N2O emissions from...,NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...,Biochar; Nitrite accumulation; Nitrous oxide; ...
47975,Adaptive Noise Reduction for Sound Event Detec...,NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...,sound event detection; non-stationary noise; w...
4599,NVM Storage in IoT Devices: Opportunities and ...,ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...,IoT; NVM; storage system; energy efficiency; s...
40609,FABNet: Fusion Attention Block and Transfer Le...,NUCLEI,Cancer; Analytical models; Transfer learning; ...


In [14]:
kw_df = pd.DataFrame()
for c in ["Keywords Plus","Author Keywords"]:
    kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()
    kwp.name = 'keyword_all'
    kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)
kw_df = kw_df[~kw_df["keyword_all"].isna()].copy().drop(columns="level_1").drop_duplicates()
kw_df["keyword_all"] = kw_df["keyword_all"].apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", x))
kw_df.head(100)

Unnamed: 0,UT (Unique WOS ID),keyword_all
0,WOS:000208837000001,NANOINDENTATION
1,WOS:000208837000001,HARDNESS
2,WOS:000208837000001,PLASMA-SPRAYED COATING
3,WOS:000208837000001,INVERSE ANALYSIS
4,WOS:000208837000001,NUMERICAL METHOD
...,...,...
97,WOS:000209571700012,PERSONALIZED MEDICINE
98,WOS:000209571700012,COMPLEX NETWORK
99,WOS:000209571700012,CLINICAL PHENOTYPE NETWORK
100,WOS:000209571700012,TRADITIONAL CHINESE MEDICINE


In [15]:
wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})
wos_kwd_concat.head()

Unnamed: 0,UT (Unique WOS ID),keyword_all
0,WOS:000208837000001,NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...
1,WOS:000208863600013,COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...
2,WOS:000208863600266,ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...
3,WOS:000208863900217,DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...
4,WOS:000208935500007,ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...


In [16]:
wos.columns

Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',
       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',
       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',
       'Funding Text', 'Cited References', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',
       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',
       'Publication Year', 'Volume', 'Issue', 'Pa

In [17]:
geotext = GeoText()

def extract_location(input_text, key='countries'):
    anomalies = {"Malta":"Malta",
                 "Mongolia":"Mongolia",
                 "Quatar":"Qatar",
                 "Qatar":"Qatar",
                 "Ethiop":"Ethiopia",
                 "Nigeria":"Nigeria",
                 "BELAR":"Belarus",
                 "Venezuela":"Venezuela",
                 "Cyprus":"Cyprus",
                 "Ecuador":"Ecuador",
                 "U Arab":"United Arab Emirates",
                 "Syria":"Syria",
                 "Uganda":"Uganda",
                 "Yemen":"Yemen",
                 "Mali":"Mali",
                 "Senegal":"Senegal",
                 "Vatican":"Vatican",
                 "Uruguay":"Uruguay",
                 "Panama":"Panama",
                 "Fiji":"Fiji",
                 "Faroe":"Faroe Islands",
                 "Macedonia":"Macedonia",
                 'Mozambique':'Mozambique',
                 "Kuwait":"Kuwait",
                 "Libya":"Libya",
                 "Turkiy":"Turkey",
                 "Liberia":"Liberia",
                 "Namibia":"Namibia",
                 "Ivoire":"Ivory Coast",
                 "Guatemala":"Gutemala",
                 "Paraguay":"Paraguay",
                 "Honduras":"Honduras",
                 "Nicaragua":"Nicaragua",
                 "Trinidad":"Trinidad & Tobago",
                 "Liechtenstein":"Liechtenstein",
                 "Greenland":"Denmark"}

    extracted = geotext.extract(input_text=input_text)
    found = extracted[key].keys()
    if len(sorted(found))>0:
        return sorted(found)[0]
    elif key=='countries':
        for i  in ['Scotland','Wales','England', 'N Ireland']:
            if i in input_text:
                return 'United Kingdom'
        for j in anomalies.keys():
            if j in input_text:
                return anomalies.get(j)
    else:
        return None

with open('../eu_members.txt',"r") as f:
    eu_countries=f.readline().split(",")
    eu_countries=[i.strip() for i in eu_countries]

def country_cleanup(country):
    if "USA" in country:
        return "USA"
    elif "China" in country:
        return "China"
    elif country in ["England", "Northern Ireland", "Wales", "Scotland","N Ireland"]:
        return "United Kingdom"
    else:
        return country


def country_type(country):
    if country in eu_countries:
        return "EU"
    elif country=="China":
        return "China"
    elif country in ["Switzerland", 'Norway','United Kingdom']:
        return "Non-EU associate"
    else:
        return "Other"


In [18]:
locations = wos.groupby(record_col)["Addresses"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns="level_1")


locations = locations[locations["Addresses"]!=""].copy()
locations["Address"] = locations["Addresses"].apply(lambda x:x.split("]")[-1])
locations["Authors_of_address"] = locations["Addresses"].apply(lambda x:x.split("]")[0])

In [19]:
len(locations)

312820

In [20]:
locations["Address"] = locations["Address"].str.strip().str.strip(";")
locations = locations.groupby([record_col,"Authors_of_address"])["Address"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_2")
locations.head(100)

Unnamed: 0,UT (Unique WOS ID),Authors_of_address,Address
0,WOS:000208837000001,"Gitzhofer, Francois","Univ Sherbrooke, Dept Chem Engn, Plasma Techno..."
1,WOS:000208837000001,"Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...","Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L..."
2,WOS:000208837000001,"Guo, Wei-Chao; Zhang, Wei-Hong","Northwestern Polytech Univ, Key Lab Contempora..."
3,WOS:000208837000001,"Rauchs, Gast","Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru..."
4,WOS:000208863600013,"Hu, Baolan","Zhejiang Univ, Dept Environm Engn, Hangzhou 31..."
...,...,...,...
95,WOS:000209546000001,"Salahuddin, Nawal","Aga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca..."
96,WOS:000209546000001,"Shrestha, Babu Raja","Kathmandu Med Coll Teaching Hosp, Dept Anesthe..."
97,WOS:000209546000001,"Tan, Cheng Cheng","Sultanah Aminah Hosp, Dept Anaesthesia & Inten..."
98,WOS:000209546000001,"Tang, Yao-Qing","Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,..."


In [21]:
# import dask.dataframe as dd
#
# locations_ddf = dd.from_pandas(locations, npartitions=4)  # convert pandas DataFrame to Dask DataFrame
# loc_compute = locations_ddf.groupby([record_col,"Authors_of_address"])["Address"].apply(lambda x: x.str.split(';')).explode().compute()  # compute the result

In [22]:
# locations_test = locations.head(1000)
# locations_test = locations_test.groupby([record_col,"Authors_of_address"])["Address"].str.split(';').explode()
# locations_test

In [23]:

# locations["Country"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))
locations["Country"]=locations['Address'].apply(lambda x: x.split(",")[-1].strip(" ").strip(";").strip(" "))
locations["Country"]=locations['Country'].apply(lambda x: country_cleanup(x))
locations["City"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))
locations["Country_Type"] = locations["Country"].apply(lambda x: country_type(x))

In [24]:
scope_types = ["EU","China","Non-EU associate"]
locations=locations[locations["Country_Type"].isin(scope_types)]

In [25]:
univ_locations = locations[[record_col,"Address","Country","City","Country_Type"]].copy()
univ_locations["Institution"] = univ_locations["Address"].apply(lambda x: x.split(",")[0])
univ_locations = univ_locations.drop_duplicates()
univ_locations.head()

Unnamed: 0,UT (Unique WOS ID),Address,Country,City,Country_Type,Institution
1,WOS:000208837000001,"Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...",Belgium,Liège,EU,Univ Liege
2,WOS:000208837000001,"Northwestern Polytech Univ, Key Lab Contempora...",China,Xi’an,China,Northwestern Polytech Univ
3,WOS:000208837000001,"Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru...",Luxembourg,Luxembourg,EU,Ctr Rech Publ Henri Tudor
4,WOS:000208863600013,"Zhejiang Univ, Dept Environm Engn, Hangzhou 31...",China,Hangzhou,China,Zhejiang Univ
5,WOS:000208863600013,"Delft Univ Technol, Dept Biotechnol, Delft, Ne...",Netherlands,Delft,EU,Delft Univ Technol


In [26]:
author_locations = locations.groupby([record_col,"Country","Country_Type"])["Authors_of_address"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_3")
author_locations["Author_name"] = author_locations["Authors_of_address"].str.strip()
author_locations = author_locations.drop(columns="Authors_of_address")
author_locations["author_str_id"] = author_locations["Author_name"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))
author_locations["author_str_id"] = author_locations["author_str_id"].apply(md5hash)
author_locations = author_locations.drop(columns="Author_name")
author_locations.head()

Unnamed: 0,UT (Unique WOS ID),Country,Country_Type,author_str_id
0,WOS:000208837000001,Belgium,EU,6079964a4094c607358a130e41e89f90
1,WOS:000208837000001,Belgium,EU,2321037fa90ac94a23b88a79f1c7f454
2,WOS:000208837000001,Belgium,EU,8a1bfa1e7bc52d323f0d9c23a9b74ed3
3,WOS:000208837000001,China,China,6079964a4094c607358a130e41e89f90
4,WOS:000208837000001,China,China,17fb036de6a4db3ba39ccab3d8307c04


In [27]:
author_locations[author_locations['author_str_id'].duplicated(False)]

Unnamed: 0,UT (Unique WOS ID),Country,Country_Type,author_str_id
0,WOS:000208837000001,Belgium,EU,6079964a4094c607358a130e41e89f90
3,WOS:000208837000001,China,China,6079964a4094c607358a130e41e89f90
4,WOS:000208837000001,China,China,17fb036de6a4db3ba39ccab3d8307c04
6,WOS:000208863600013,China,China,54c7bc6fe9b77434ca1bf04d763d843b
7,WOS:000208863600013,Netherlands,EU,df81f9da6c8f5c968c16ef0aab1bb8f9
...,...,...,...,...
643323,WOS:000964683900016,Italy,EU,3c631398a81ab7058d95a0c6418a2c0b
643324,WOS:000964683900016,Italy,EU,3c631398a81ab7058d95a0c6418a2c0b
643325,WOS:000967389100001,China,China,ce65541a6c334225a9617439f4a95012
643326,WOS:000967389100001,Norway,Non-EU associate,7c52a53f8d79b1ffd4f2e4cde9548e1d


In [28]:
author_primary_region = author_locations.sort_values(by="Country_Type").drop_duplicates(subset=[record_col,"author_str_id"])
# author_primary_region

china=author_primary_region[author_primary_region["Country_Type"]=="China"][record_col].unique()
eu=author_primary_region[author_primary_region["Country_Type"]=="EU"][record_col].unique()
assoc=author_primary_region[author_primary_region["Country_Type"]=="Non-EU associate"][record_col].unique()


# records that have distinct authors with different country affiliations
valid_scope = wos[((wos[record_col].isin(china))
         &
         ((wos[record_col].isin(eu))
         |
         (wos[record_col].isin(assoc))))][record_col].unique()

In [29]:
author_primary_region.head()

Unnamed: 0,UT (Unique WOS ID),Country,Country_Type,author_str_id
537692,WOS:000732204600001,China,China,8fe31cbbd07c639aa4d779688896be81
204027,WOS:000414089800001,China,China,67c7beb18fafd77f1319739fa683bc5e
204028,WOS:000414089800001,China,China,7269f0a31fc620688aae12aad9e3cd85
204029,WOS:000414089800001,China,China,ac28aea698a527fb5195d3d24189ea04
204030,WOS:000414090800001,China,China,6c91bf481b6bddc1426d12a18823224a


In [30]:
print(f'Number of records: {len(wos)}')
print(f'Number of valid cooperation records: {len(valid_scope)}')

Number of records: 51904
Number of valid cooperation records: 46060


In [31]:
wos = wos[wos[record_col].isin(valid_scope)]
locations = locations[locations[record_col].isin(valid_scope)]
univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]
author_locations = author_locations[author_locations[record_col].isin(valid_scope)]
author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]

In [32]:
affiliations = wos.groupby(record_col)["Affiliations"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_1")
affiliations["Affiliations"] = affiliations["Affiliations"].str.strip().str.upper().fillna("UNKNOWN")
affiliations = affiliations.drop_duplicates()

In [33]:
affiliations["Affiliations"].value_counts()

Affiliations
CHINESE ACADEMY OF SCIENCES                                                       5616
UNIVERSITY OF LONDON                                                              2604
UDICE-FRENCH RESEARCH UNIVERSITIES                                                2240
CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS)                               2170
TSINGHUA UNIVERSITY                                                               1935
                                                                                  ... 
UNIVERSITY OF FUKUI                                                                  1
PONTIFICIA UNIVERSIDADE CATOLICA DE GOIAS                                            1
INSTITUTE OF ORGANIC CHEMISTRY & BIOCHEMISTRY OF THE CZECH ACADEMY OF SCIENCES       1
UNIVERSITAS PELITA HARAPAN                                                           1
FRANCISCUS GASTHUIS                                                                  1
Name: count, Length: 7609, dty

In [34]:
univ_locations["Institution"].value_counts()

Institution
Chinese Acad Sci                                 5749
Tsinghua Univ                                    2315
Shanghai Jiao Tong Univ                          1976
Zhejiang Univ                                    1806
Peking Univ                                      1661
                                                 ... 
Natl Technol Inst Mental Disorders                  1
Seinajoki Univ Appl Sci                             1
JD Intelligent City Res                             1
CAS Ctr Excellence Planetol                         1
Key Lab Intelligent Prevent Med Zhejiang Prov       1
Name: count, Length: 19821, dtype: int64

In [35]:
univ_locations[record_col].nunique()

46060

In [36]:
affiliations[record_col].nunique()

46060

In [37]:
univ_locations["Institution"].value_counts().sum()

202790

In [38]:
affiliations["Affiliations"].value_counts().sum()

268471

In [39]:
wos_cat = wos.groupby(record_col)["WoS Categories"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_1")
wos_cat["WoS Categories"].value_counts()

WoS Categories
 Engineering, Electrical & Electronic        8303
Computer Science, Artificial Intelligence    6115
 Telecommunications                          4661
Computer Science, Information Systems        4584
Engineering, Electrical & Electronic         4036
                                             ... 
Cultural Studies                                1
 Ornithology                                    1
 Criminology & Penology                         1
Art                                             1
 Psychology, Developmental                      1
Name: count, Length: 425, dtype: int64

In [40]:
wos_areas = wos.groupby(record_col)["Research Areas"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_1")
wos_areas["Research Areas"] = wos_areas["Research Areas"].str.strip()
wos_areas["Research Areas"].value_counts()

Research Areas
Engineering                                  18098
Computer Science                             15658
Telecommunications                            5046
Environmental Sciences & Ecology              3246
Imaging Science & Photographic Technology     2947
                                             ...  
Film, Radio & Television                         2
Area Studies                                     2
Cultural Studies                                 1
Asian Studies                                    1
Music                                            1
Name: count, Length: 145, dtype: int64

In [41]:
[c for c in wos.columns if "_English" in c]

['Domain_English', 'Field_English', 'SubField_English']

In [42]:
metrix_levels = [c for c in wos.columns if "_English" in c]
for m in metrix_levels:
    wos[m] = wos[m].replace({"article-level classification":"Multidisciplinary"})


In [43]:
wos

Unnamed: 0,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,Article Title,Source Title,...,UT (Unique WOS ID),issn_var,issn,Domain_English,Field_English,SubField_English,2.00 SEQ,Source_title,srcid,issn_type
0,J,"Yan, Z; Jing, XY; Pedrycz, W",,,,"Yan, Zheng; Jing, Xuyang; Pedrycz, Witold",,,LEFusing and mining opinions for reputation ge...,INFORMATION FUSION,...,WOS:000394070100013,issn,15662535,Applied Sciences,Information & Communication Technologies,Artificial Intelligence & Image Processing,31,Information Fusion,2.609900e+04,issn1
1,J,"Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...",,,,"Sookhak, Mehdi; Yu, F. Richard; He, Ying; Tale...",,,FOG VEHICULAR COMPUTING Augmentation of Fog Co...,IEEE VEHICULAR TECHNOLOGY MAGAZINE,...,WOS:000408568800008,issn,15566072,Applied Sciences,Information & Communication Technologies,Networking & Telecommunications,37,IEEE Vehicular Technology Magazine,5.200153e+09,issn1
2,J,"Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...",,,,"Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...",,,Deep Reinforcement Learning for Intelligent In...,IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ...,...,WOS:000502789700018,issn,23327731,Applied Sciences,Information & Communication Technologies,Networking & Telecommunications,37,IEEE Transactions on Cognitive Communications ...,2.110085e+10,issn1
3,J,"Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...",,,,"Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...",,,An Intelligent UAV based Data Aggregation Algo...,COMPUTER NETWORKS,...,WOS:000626758800004,issn,13891286,Applied Sciences,Information & Communication Technologies,Networking & Telecommunications,37,Computer Networks,2.681100e+04,issn1
4,J,"Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...",,,,"Lu, Tianguang; Chen, Xinyu; McElroy, Michael B...",,,A Reinforcement Learning-Based Decision System...,IEEE TRANSACTIONS ON SMART GRID,...,WOS:000641976000028,issn,19493053,Applied Sciences,Enabling & Strategic Technologies,Energy,14,IEEE Transactions on Smart Grid,1.970017e+10,issn2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51897,J,"Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...",,,,"Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...",,,Neural modal ordinary differential equations: ...,DATA-CENTRIC ENGINEERING,...,WOS:000906995300001,eissn,,Applied Sciences,Information & Communication Technologies,Artificial Intelligence & Image Processing,,,,
51898,J,"Wang, HC; Roussel, P; Denby, B",,,,"Wang, Hongcui; Roussel, Pierre; Denby, Bruce",,,Improving ultrasound-based multimodal speech r...,JASA EXPRESS LETTERS,...,WOS:000642230800005,eissn,,Natural Sciences,Physics & Astronomy,Acoustics,,,,
51899,J,"Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S",,,,"Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...",,,Application of computer-aided image reconstruc...,EGYPTIAN JOURNAL OF NEUROSURGERY,...,WOS:000807222600001,eissn,,Health Sciences,Clinical Medicine,Neurology & Neurosurgery,,,,
51902,J,"Chu, WP; Song, Y",,,,"Chu, Wenping; Song, Yang",,,Study on Dynamic Interaction of Railway Pantog...,VIBRATION,...,WOS:000661660800001,eissn,,Applied Sciences,Engineering,Mechanical Engineering & Transports,,,,


In [44]:
metrix_levels

['Domain_English', 'Field_English', 'SubField_English']

In [45]:
record_countries = locations[[record_col,"Country"]].drop_duplicates()
record_author_locations = author_locations[[record_col,"author_str_id","Country"]].drop_duplicates()
record_institution = univ_locations[[record_col,"Institution","Country"]].drop_duplicates()
country_types = locations[["Country","Country_Type"]].drop_duplicates()

In [46]:
# Basic network layout

In [47]:
country_collabs = record_countries.merge(record_countries, on=record_col)
country_collabs = country_collabs[country_collabs["Country_x"]!=country_collabs["Country_y"]]
country_collabs["weight"] = 0.5

In [48]:
inst_collabs = record_institution.merge(record_institution, on=record_col)
inst_collabs = inst_collabs[inst_collabs["Institution_x"]!=inst_collabs["Institution_y"]]
inst_collabs["weight"] = 0.5

In [49]:
wos.columns

Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',
       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',
       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',
       'Funding Text', 'Cited References', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',
       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',
       'Publication Year', 'Volume', 'Issue', 'Pa

In [50]:
drop_cols = [ws for ws in wos.columns if (("uthor" in ws or "ddress" in ws or "ORCID" in
                                           ws or "esearcher" in ws or "ditor" in ws or "name" in ws or 'SEQ' in ws) and "eyword" not in ws)]
drop_cols

['Authors',
 'Book Authors',
 'Book Editors',
 'Book Group Authors',
 'Author Full Names',
 'Book Author Full Names',
 'Group Authors',
 'Addresses',
 'Reprint Addresses',
 'Email Addresses',
 'Researcher Ids',
 'ORCIDs',
 'Publisher Address',
 '2.00 SEQ']

In [51]:
outdir="wos_processed_data"

In [52]:
os.makedirs(outdir, exist_ok=True)

wos.drop(columns=drop_cols).to_excel(f"{outdir}/wos_processed.xlsx", index=False)

record_countries.to_excel(f"{outdir}/wos_countries.xlsx", index=False)

record_author_locations.to_excel(f"{outdir}/wos_author_locations.xlsx", index=False)

record_institution.to_excel(f"{outdir}/wos_institution_locations.xlsx", index=False)

kw_df.to_excel(f"{outdir}/wos_keywords.xlsx", index=False)

country_types.to_excel(f"{outdir}/wos_country_types.xlsx", index=False)

In [53]:
wos.drop(columns=drop_cols).to_csv(f"{outdir}/wos_processed.csv", index=False, sep='\t')

record_countries.to_csv(f"{outdir}/wos_countries.csv", index=False, sep='\t')

record_author_locations.to_csv(f"{outdir}/wos_author_locations.csv", index=False, sep='\t')

record_institution.to_csv(f"{outdir}/wos_institution_locations.csv", index=False, sep='\t')

kw_df.to_csv(f"{outdir}/wos_keywords.csv", index=False, sep='\t')

country_types.to_csv(f"{outdir}/wos_country_types.csv", index=False, sep='\t')

inst_collabs.to_csv(f"{outdir}/wos_inst_collabs.csv", index=False, sep='\t')

country_collabs.to_csv(f"{outdir}/wos_country_collabs.csv", index=False, sep='\t')

In [54]:
wos_areas.to_csv(f"{outdir}/wos_research_areas.csv", index=False, sep='\t')

wos_subcat.to_csv(f"{outdir}/wos_categories.csv", index=False, sep='\t')