In [None]:
import numpy as np
import pandas as pd
import os
import shutil
from flashgeotext.geotext import GeoText
import re

In [None]:
import hashlib

def md5hash(s: str):
    """Return the hexadecimal MD5 digest of ``s`` after encoding it as UTF-8."""
    encoded = s.encode('utf-8')
    digest = hashlib.md5(encoded)
    return digest.hexdigest()

In [None]:
# Column holding the unique Web of Science record identifier; used as the join
# key for every derived table in this notebook.
record_col="UT (Unique WOS ID)"

# Tab-separated file with the concatenated WoS records that this notebook reads.
# The location can be overridden through the WOS_RECORDS_CSV environment variable
# so the notebook is not tied to one machine; the historical absolute path stays
# the default, so existing setups behave exactly as before.
outfile = os.environ.get(
    "WOS_RECORDS_CSV",
    r"C:\Users\radvanyi\PycharmProjects\ZSI_analytics\WOS\wos_extract\wos_records_concat.csv",
)

In [None]:
# --- Load the raw WoS records and keep publication years 2011-2022 ---
wos = pd.read_csv(outfile, sep="\t",low_memory=False)

wos = wos[((wos["Publication Year"]<2023)&(wos["Publication Year"]>2010))].copy()
print(f'Number of initial (valid interval) records: {len(wos)}')

# --- Journal classification: reshape to one row per (journal, ISSN variant) ---
metrix = pd.read_excel("sm_journal_classification.xlsx", sheet_name="Journal_Classification")

# The stacked level is named explicitly instead of renaming the auto-generated
# 'level_<n>' label, which silently stops matching when the column count changes.
metrix_id_cols = [c for c in metrix.columns if "issn" not in c]
metrix = (
    metrix.set_index(metrix_id_cols)
    .stack()
    .dropna()  # explicit, so missing ISSNs never become merge keys regardless of stack() semantics
    .rename_axis(metrix_id_cols + ["issn_type"])
    .rename("issn")
    .reset_index()
)
metrix["issn"]=metrix["issn"].str.replace("-","").str.lower().str.strip()

# --- Normalise the WoS ISSN / eISSN and reshape to one row per (record, ISSN variant) ---
# The lower-case helper columns are the only ones containing "issn" (case-sensitive),
# so they are the ones that get stacked; the original "ISSN"/"eISSN" stay as id columns.
wos["issn"] = wos["ISSN"].str.replace("-","").str.lower().str.strip()
wos["eissn"] = wos["eISSN"].str.replace("-","").str.lower().str.strip()
wos_id_cols = [c for c in wos.columns if "issn" not in c]
wos = (
    wos.set_index(wos_id_cols)
    .stack()
    .dropna()
    .rename_axis(wos_id_cols + ["issn_var"])
    .rename("issn")
    .reset_index()
)

# --- Attach the journal classification via the normalised ISSN ---
wos_merge = wos.merge(metrix, on="issn", how="left")

# A record counts as indexed when at least one of its ISSN variants matched a journal.
wos_indexed = wos_merge[~wos_merge["Domain_English"].isna()]
wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]

# Collapse back to one row per record. Sorting "issn_var" descending places the
# "issn" row before the "eissn" row, so the print-ISSN row wins when both exist.
wos_unindexed = wos_unindexed.sort_values(by=["issn_var"],ascending=False).drop_duplicates(subset=record_col)
wos = wos_indexed.sort_values(by=["issn_var"],ascending=False).drop_duplicates(subset=record_col)

wos_postmerge = wos.copy()
print(f'Number of METRIX filtered records: {len(wos)}')
print(f'Number of unindexed records: {len(wos_unindexed)}')

# --- De-duplicate ---
# DOI-based: keep the first occurrence of each DOI and drop the later copies.
# (Previously duplicated(False) flagged *every* copy, so records sharing a DOI
# vanished entirely instead of being reduced to one.) Rows without a DOI are kept.
wos = wos[~(wos["DOI"].notna() & wos["DOI"].duplicated(keep="first"))]
# Metadata-based: same type, authors, title, source and year.
wos = wos.drop_duplicates(subset=["Publication Type","Document Type","Authors","Article Title","Source Title","Publication Year"])
print(f'Number of filtered records (dropping duplicates): {len(wos)}')

In [None]:
# Record counts per "Domain_English" value after the journal merge and de-duplication.
wos["Domain_English"].value_counts()

In [None]:
def _most_frequent(values):
    """Return the most frequent non-null value (smallest on ties), or NaN if there is none."""
    modes = values.mode()
    # mode() returns an empty Series for an all-null group; indexing it with [0] would raise.
    return modes.iloc[0] if not modes.empty else np.nan


# Lookup table: for every (WoS Categories, Research Areas) combination seen among
# the journal-classified records, the most common domain / field / subfield.
wos_classifier = wos[["WoS Categories","Research Areas"]+list(metrix.columns)].copy().drop_duplicates()
wos_classifier = wos_classifier.groupby(["WoS Categories","Research Areas"], as_index=False)[
    ["Domain_English","Field_English","SubField_English"]
].agg(_most_frequent)

In [None]:
# Remove the journal-classification columns (this includes the "issn" merge key)
# from the unmatched records so the classifier's columns can be merged in cleanly.
wos_to_reindex = wos_unindexed.drop(columns=list(metrix.columns))

# Classify by exact match on both WoS Categories and Research Areas.
# Matching on only one of the two keys was tried as an alternative and is disabled:
# wos_found = wos_to_reindex.merge(wos_classifier, on="Research Areas", how="inner")
# # wos_found = wos_to_reindex.merge(wos_classifier, on="WoS Categories", how="inner")
wos_found = pd.merge(
    wos_to_reindex,
    wos_classifier,
    how="inner",
    on=["WoS Categories","Research Areas"],
)

# Whatever could not be classified this way either.
wos_stillost = wos_unindexed[~wos_unindexed[record_col].isin(wos_found[record_col])]

print("Found:", wos_found[record_col].nunique(),"\nLost forever:", wos_stillost[record_col].nunique())

In [None]:
# Append the records classified via their WoS categories to the journal-classified ones.
# NOTE: `wos` is rebuilt from itself, so re-running this cell alone appends wos_found again.
wos = pd.concat([wos,wos_found], ignore_index=True)
print(f'Number of records (after remerge): {len(wos)}')

In [None]:
# Record counts per "Domain_English" value, now including the category-classified records.
wos["Domain_English"].value_counts()

In [None]:
# Long table with one row per (record, WoS category): the ';'-separated category
# string of each record is split, exploded and trimmed.
wos_cat = (
    wos.groupby(record_col)["WoS Categories"]
    .apply(lambda grp: grp.str.split(';'))
    .explode()
    .reset_index()
    .drop(columns="level_1")
    .assign(**{"WoS Categories": lambda frame: frame["WoS Categories"].str.strip()})
)
wos_cat["WoS Categories"].value_counts()

In [None]:
# Split every category at its first comma into a main category and a sub-category.
wos_subcat = wos_cat.copy()
wos_subcat[['WoS Category', 'WoS SubCategory']] = wos_subcat["WoS Categories"].str.split(",", n=1, expand=True)
for col_name in ('WoS Category', 'WoS SubCategory', "WoS Categories"):
    wos_subcat[col_name] = wos_subcat[col_name].str.strip()

# Count each main category at most once per record.
(
    wos_subcat
    .drop_duplicates(subset=[record_col, 'WoS Category'])["WoS Category"]
    .value_counts()
)

In [None]:
# Long table with one row per (record, research area), trimmed of surrounding whitespace.
wos_areas = (
    wos.groupby(record_col)["Research Areas"]
    .apply(lambda grp: grp.str.split(';'))
    .explode()
    .reset_index()
    .drop(columns="level_1")
    .assign(**{"Research Areas": lambda frame: frame["Research Areas"].str.strip()})
)
wos_areas["Research Areas"].value_counts()

In [None]:
# Eyeball 100 randomly drawn records (no seed is set, so the sample differs per run).
wos[["Article Title","Keywords Plus","Author Keywords"]].sample(100)

In [None]:
# Long keyword table: one row per (record, keyword), pooled from both keyword columns.
kw_parts = []
for c in ["Keywords Plus","Author Keywords"]:
    kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()
    kwp.name = 'keyword_all'
    kw_parts.append(kwp.reset_index())
# Reversed to keep the historical row order (last processed column first).
kw_df = pd.concat(kw_parts[::-1], ignore_index=True)
kw_df = kw_df[~kw_df["keyword_all"].isna()].copy().drop(columns="level_1")

# Remove bracketed/parenthesised fragments. The pattern is now a raw string (the
# old literal relied on invalid escape sequences), and the keyword is trimmed
# again because removing the fragment usually leaves dangling whitespace.
kw_df["keyword_all"] = (
    kw_df["keyword_all"]
    .str.replace(r"[\(\[].*?[\)\]]", "", regex=True)
    .str.strip()
)
# Cleaning can empty a keyword or make two keywords of a record identical.
kw_df = kw_df[kw_df["keyword_all"] != ""].drop_duplicates()
kw_df.head(100)

In [None]:
# Collapse the keyword table to one row per record, joining that record's keywords with '; '.
wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})
wos_kwd_concat.head()

In [None]:
# List the columns currently present in `wos`.
wos.columns

In [None]:
# Shared GeoText extractor instance; extract_location() below reads it as a global.
geotext = GeoText()

# Fallback table: substring of the address text -> country name. It is consulted
# only when the geotext lookup finds no country, and is checked in insertion order.
# Built once at definition time instead of on every call.
_COUNTRY_ANOMALIES = {
    "Malta": "Malta",
    "Mongolia": "Mongolia",
    "Quatar": "Qatar",
    "Qatar": "Qatar",
    "Ethiop": "Ethiopia",
    "Nigeria": "Nigeria",
    "BELAR": "Belarus",
    "Venezuela": "Venezuela",
    "Cyprus": "Cyprus",
    "Ecuador": "Ecuador",
    "U Arab": "United Arab Emirates",
    "Syria": "Syria",
    "Uganda": "Uganda",
    "Yemen": "Yemen",
    "Mali": "Mali",
    "Senegal": "Senegal",
    "Vatican": "Vatican",
    "Uruguay": "Uruguay",
    "Panama": "Panama",
    "Fiji": "Fiji",
    "Faroe": "Faroe Islands",
    "Macedonia": "Macedonia",
    'Mozambique': 'Mozambique',
    "Kuwait": "Kuwait",
    "Libya": "Libya",
    "Turkiy": "Turkey",
    "Liberia": "Liberia",
    "Namibia": "Namibia",
    "Ivoire": "Ivory Coast",
    "Guatemala": "Guatemala",  # was misspelled "Gutemala"
    "Paraguay": "Paraguay",
    "Honduras": "Honduras",
    "Nicaragua": "Nicaragua",
    "Trinidad": "Trinidad & Tobago",
    "Liechtenstein": "Liechtenstein",
    "Greenland": "Denmark",
}

# Constituent parts of the UK as they are matched in the address text.
_UK_NATIONS = ('Scotland', 'Wales', 'England', 'N Ireland')


def extract_location(input_text, key='countries'):
    """Extract a single location name from free text.

    Parameters
    ----------
    input_text : str
        Text to search (an address line in this notebook).
    key : str, default 'countries'
        Which entry of the geotext extraction result to use; this notebook
        passes 'countries' or 'cities'.

    Returns
    -------
    str or None
        The alphabetically first name found by geotext under ``key``. If geotext
        finds nothing and ``key == 'countries'``, falls back to substring
        matching: UK nations map to 'United Kingdom', then the anomaly table is
        tried. ``None`` when nothing matches or ``input_text`` is not a string.
    """
    # Missing values (None / NaN) carry no location information.
    if not isinstance(input_text, str):
        return None

    extracted = geotext.extract(input_text=input_text)
    found = sorted(extracted[key].keys())
    if found:
        return found[0]

    if key == 'countries':
        if any(nation in input_text for nation in _UK_NATIONS):
            return 'United Kingdom'
        for fragment, country_name in _COUNTRY_ANOMALIES.items():
            if fragment in input_text:
                return country_name

    return None

# EU member names: only the first line of the file is read, split on commas,
# and each entry is trimmed of surrounding whitespace.
with open('../eu_members.txt',"r") as f:
    eu_countries = [name.strip() for name in f.readline().split(",")]

def country_cleanup(country):
    """Normalise a raw country string taken from an address.

    Any value containing "USA" becomes "USA", any value containing "China"
    becomes "China", and the UK constituent names become "United Kingdom".
    Every other string is returned unchanged. Non-string input (None / NaN)
    is passed through untouched instead of raising a TypeError.
    """
    if not isinstance(country, str):
        return country
    if "USA" in country:
        return "USA"
    if "China" in country:
        return "China"
    if country in ("England", "Northern Ireland", "Wales", "Scotland", "N Ireland"):
        return "United Kingdom"
    return country


def country_type(country, eu_members=None):
    """Classify a (cleaned) country name into a collaboration region.

    Parameters
    ----------
    country : str
        Country name, as produced by ``country_cleanup``.
    eu_members : collection of str, optional
        Names counted as EU members. Defaults to the notebook-level
        ``eu_countries`` list, so existing single-argument calls are unchanged.

    Returns
    -------
    str
        "EU", "China", "Non-EU associate" (Switzerland, Norway, United Kingdom)
        or "Other". EU membership is checked first.
    """
    if eu_members is None:
        eu_members = eu_countries
    if country in eu_members:
        return "EU"
    if country == "China":
        return "China"
    if country in ("Switzerland", 'Norway', 'United Kingdom'):
        return "Non-EU associate"
    return "Other"


In [None]:
# Each address entry is introduced by a "[author list]" prefix, so splitting on
# '[' yields one chunk per "authors] address" entry for every record.
locations = wos.groupby(record_col)["Addresses"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns="level_1")

# Drop the empty chunk in front of the first '[' and records without any address.
# The latter come through as NaN, which passed the former `!= ""` test and then
# crashed on `.split`.
locations = locations[locations["Addresses"].notna() & (locations["Addresses"]!="")].copy()

# Text after the last ']' is the address; text before the first ']' is the author list.
chunks = locations["Addresses"].str.split("]")
locations["Address"] = chunks.str[-1]
locations["Authors_of_address"] = chunks.str[0]

In [None]:
# Number of (record, address chunk) rows.
len(locations)

In [None]:
# Trim whitespace and stray ';' around each address, then split entries that still
# hold several ';'-separated addresses into one row per address.
locations["Address"] = locations["Address"].str.strip().str.strip(";")
locations = (
    locations.groupby([record_col, "Authors_of_address"])["Address"]
    .apply(lambda grp: grp.str.split(';'))
    .explode()
    .reset_index()
    .drop(columns="level_2")
)
locations.head(100)

In [None]:
# import dask.dataframe as dd
#
# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame
# loc_compute = locations_ddf.groupby([record_col,"Authors_of_address"])["Address"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result

In [None]:
# locations_test = locations.head(1000)
# locations_test = locations_test.groupby([record_col,"Authors_of_address"])["Address"].str.split(';').explode()
# locations_test

In [None]:

# Country: the text after the last comma of the address, trimmed of spaces and ';',
# then normalised. The geotext-based country lookup is kept for reference but disabled:
# locations["Country"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))
locations["Country"] = (
    locations['Address']
    .apply(lambda addr: addr.split(",")[-1].strip(" ").strip(";").strip(" "))
    .apply(country_cleanup)
)
# City: taken from the geotext lookup on the full address.
locations["City"] = locations['Address'].apply(
    lambda addr: extract_location(input_text=addr, key='cities')
)
# Region bucket of the cleaned country.
locations["Country_Type"] = locations["Country"].apply(country_type)

In [None]:
# Keep only addresses whose country falls into one of the regions of interest
# (everything classified as "Other" by country_type is dropped).
scope_types = ["EU","China","Non-EU associate"]
locations=locations[locations["Country_Type"].isin(scope_types)]

In [None]:
# Institution-level view of the addresses.
univ_locations = locations[[record_col,"Address","Country","City","Country_Type"]].copy()
# The institution is the text before the first comma. It is stripped because
# addresses produced by the ';' split can start with a space, which would make
# " Univ X" and "Univ X" count as two different institutions.
univ_locations["Institution"] = univ_locations["Address"].str.split(",").str[0].str.strip()
univ_locations = univ_locations.drop_duplicates()
univ_locations.head()

In [None]:
# One row per (record, country, region, author): split the ';'-separated author list.
author_locations = (
    locations.groupby([record_col, "Country", "Country_Type"])["Authors_of_address"]
    .apply(lambda grp: grp.str.split(';'))
    .explode()
    .reset_index()
    .drop(columns="level_3")
)

# Replace the author name by an opaque id: the MD5 hash of the trimmed,
# lower-cased name reduced to its alphanumeric characters.
author_locations["author_str_id"] = (
    author_locations["Authors_of_address"]
    .str.strip()
    .apply(lambda name: md5hash(''.join(filter(str.isalnum, name.lower()))))
)
author_locations = author_locations.drop(columns="Authors_of_address")
author_locations.head()

In [None]:
# All rows whose author id occurs more than once in the table.
author_locations[author_locations['author_str_id'].duplicated(False)]

In [None]:
# One region per (record, author): after sorting by region name, the first row
# per author within a record is kept.
author_primary_region = (
    author_locations
    .sort_values(by="Country_Type")
    .drop_duplicates(subset=[record_col, "author_str_id"])
)
# author_primary_region

# Record ids having at least one author assigned to each region.
china = author_primary_region.loc[author_primary_region["Country_Type"] == "China", record_col].unique()
eu = author_primary_region.loc[author_primary_region["Country_Type"] == "EU", record_col].unique()
assoc = author_primary_region.loc[author_primary_region["Country_Type"] == "Non-EU associate", record_col].unique()

# records that have distinct authors with different country affiliations:
# a China author together with an EU or a Non-EU-associate author
valid_scope = wos.loc[
    wos[record_col].isin(china)
    & (wos[record_col].isin(eu) | wos[record_col].isin(assoc)),
    record_col,
].unique()

In [None]:
# Preview of the per-author region assignment.
author_primary_region.head()

In [None]:
# Compare the total number of records with the number that qualify as cooperation records.
print(f'Number of records: {len(wos)}')
print(f'Number of valid cooperation records: {len(valid_scope)}')

In [None]:
# Restrict the records and the tables derived from them to the valid cooperation records.
wos = wos[wos[record_col].isin(valid_scope)]
locations = locations[locations[record_col].isin(valid_scope)]
univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]
author_locations = author_locations[author_locations[record_col].isin(valid_scope)]
# Filter author_primary_region from itself. It was previously rebuilt from
# author_locations, which discarded the one-region-per-author de-duplication.
author_primary_region = author_primary_region[author_primary_region[record_col].isin(valid_scope)]

In [None]:
# One row per distinct (record, affiliation): split the ';'-separated string,
# normalise to trimmed upper case and label missing affiliations "UNKNOWN".
affiliations = (
    wos.groupby(record_col)["Affiliations"]
    .apply(lambda grp: grp.str.split(';'))
    .explode()
    .reset_index()
    .drop(columns="level_1")
    .assign(Affiliations=lambda frame: frame["Affiliations"].str.strip().str.upper().fillna("UNKNOWN"))
    .drop_duplicates()
)

In [None]:
# Number of records per affiliation string.
affiliations["Affiliations"].value_counts()

In [None]:
# Number of rows per institution name derived from the addresses.
univ_locations["Institution"].value_counts()

In [None]:
# Number of distinct records covered by the address-derived institution table.
univ_locations[record_col].nunique()

In [None]:
# Number of distinct records covered by the affiliation table.
affiliations[record_col].nunique()

In [None]:
# Total number of rows with a non-null institution (value_counts skips nulls).
univ_locations["Institution"].value_counts().sum()

In [None]:
# Total number of rows with a non-null affiliation, for comparison with the institution total.
affiliations["Affiliations"].value_counts().sum()

In [None]:
# Rebuild the per-record category table from the filtered records.
wos_cat = wos.groupby(record_col)["WoS Categories"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns="level_1")
# Strip the pieces, as the first version of this table does: after splitting
# "A; B" the second piece is " B", which would otherwise be counted separately from "B".
wos_cat["WoS Categories"] = wos_cat["WoS Categories"].str.strip()
wos_cat["WoS Categories"].value_counts()

In [None]:
# Rebuild the per-record research-area table from the filtered records.
wos_areas = (
    wos.groupby(record_col)["Research Areas"]
    .apply(lambda grp: grp.str.split(';'))
    .explode()
    .reset_index()
    .drop(columns="level_1")
    .assign(**{"Research Areas": lambda frame: frame["Research Areas"].str.strip()})
)
wos_areas["Research Areas"].value_counts()

In [None]:
# Columns of `wos` whose name contains "_English" (the classification levels).
[c for c in wos.columns if "_English" in c]

In [None]:
# In every classification-level column, relabel the placeholder value
# "article-level classification" as "Multidisciplinary".
metrix_levels = [col for col in wos.columns if "_English" in col]
for level in metrix_levels:
    wos[level] = wos[level].replace("article-level classification", "Multidisciplinary")


In [None]:
# Display the processed record table.
wos

In [None]:
# Show which classification-level columns were relabelled.
metrix_levels

In [None]:
# De-duplicated link tables used for the exports and the collaboration networks.
# record -> country
record_countries = locations.loc[:, [record_col, "Country"]].drop_duplicates()
# record -> author id -> country
record_author_locations = author_locations.loc[:, [record_col, "author_str_id", "Country"]].drop_duplicates()
# record -> institution -> country
record_institution = univ_locations.loc[:, [record_col, "Institution", "Country"]].drop_duplicates()
# country -> region bucket
country_types = locations.loc[:, ["Country", "Country_Type"]].drop_duplicates()

In [None]:
# Basic network layout

In [None]:
# Country co-occurrence edges: pair up the countries of each record with each
# other, drop self-pairs, and give every directed pair a weight of 0.5 (each
# unordered pair appears in both directions).
country_collabs = (
    record_countries.merge(record_countries, on=record_col)
    .loc[lambda pairs: pairs["Country_x"] != pairs["Country_y"]]
    .assign(weight=0.5)
)

In [None]:
# Institution co-occurrence edges, built the same way as the country edges:
# self-join per record, drop self-pairs, weight 0.5 per directed pair.
inst_collabs = (
    record_institution.merge(record_institution, on=record_col)
    .loc[lambda pairs: pairs["Institution_x"] != pairs["Institution_y"]]
    .assign(weight=0.5)
)

In [None]:
# Columns of the processed table, for choosing which ones to drop before export.
wos.columns

In [None]:
# Columns to leave out of the export: any column whose name contains one of the
# person/address-related fragments below, except keyword columns.
drop_cols = [
    col
    for col in wos.columns
    if "eyword" not in col
    and any(
        fragment in col
        for fragment in ("uthor", "ddress", "ORCID", "esearcher", "ditor", "name", 'SEQ')
    )
]
drop_cols

In [None]:
# Directory (relative to the working directory) that receives all exported files.
outdir="wos_processed_data"

In [None]:
os.makedirs(outdir, exist_ok=True)

# kw_df was built before the records were restricted to valid cooperation
# records; limit it to the records that are actually exported so the keyword
# files carry no orphan rows (and stay as small as the other tables).
kw_df = kw_df[kw_df[record_col].isin(wos[record_col])]

# Excel copies of the main tables.
wos.drop(columns=drop_cols).to_excel(f"{outdir}/wos_processed.xlsx", index=False)

record_countries.to_excel(f"{outdir}/wos_countries.xlsx", index=False)

record_author_locations.to_excel(f"{outdir}/wos_author_locations.xlsx", index=False)

record_institution.to_excel(f"{outdir}/wos_institution_locations.xlsx", index=False)

kw_df.to_excel(f"{outdir}/wos_keywords.xlsx", index=False)

country_types.to_excel(f"{outdir}/wos_country_types.xlsx", index=False)

In [None]:
# Tab-separated copies of every table, written in a fixed order.
for frame, path in [
    (wos.drop(columns=drop_cols), f"{outdir}/wos_processed.csv"),
    (record_countries, f"{outdir}/wos_countries.csv"),
    (record_author_locations, f"{outdir}/wos_author_locations.csv"),
    (record_institution, f"{outdir}/wos_institution_locations.csv"),
    (kw_df, f"{outdir}/wos_keywords.csv"),
    (country_types, f"{outdir}/wos_country_types.csv"),
    (inst_collabs, f"{outdir}/wos_inst_collabs.csv"),
    (country_collabs, f"{outdir}/wos_country_collabs.csv"),
]:
    frame.to_csv(path, index=False, sep='\t')

In [None]:
wos_areas.to_csv(f"{outdir}/wos_research_areas.csv", index=False, sep='\t')

# wos_subcat was derived before the records were restricted to valid cooperation
# records and is not rebuilt afterwards (unlike wos_areas), so limit it to the
# records that are actually exported.
wos_subcat[wos_subcat[record_col].isin(wos[record_col])].to_csv(f"{outdir}/wos_categories.csv", index=False, sep='\t')