import pandas as pd
import janitor
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator
import math
import plotly.express as px
%matplotlib inline
sns.set_theme(context='notebook', style='ticks', palette='colorblind', font='sans-serif', font_scale=1, color_codes=True, rc=None)
sns.palplot(sns.color_palette())
outdir="wos_processed_data"
wos = pd.read_excel(f"../{outdir}/wos_processed.xlsx")
wos_univ = pd.read_excel(f"../{outdir}/wos_institution_locations_harmonized.xlsx")
wos_country = pd.read_excel(f"../{outdir}/wos_countries.xlsx")
wos_country_types = pd.read_excel(f"../{outdir}/wos_country_types.xlsx")
wos_country_types
 | Country | Country_Type |
---|---|---|
0 | Belgium | EU |
1 | China | China |
2 | Luxembourg | EU |
3 | Netherlands | EU |
4 | Norway | Non-EU associate |
5 | United Kingdom | Non-EU associate |
6 | France | EU |
7 | Sweden | EU |
8 | Italy | EU |
9 | Denmark | EU |
10 | Germany | EU |
11 | Slovenia | EU |
12 | Estonia | EU |
13 | Finland | EU |
14 | Bulgaria | EU |
15 | Slovakia | EU |
16 | Spain | EU |
17 | Poland | EU |
18 | Czech Republic | EU |
19 | Greece | EU |
20 | Malta | EU |
21 | Austria | EU |
22 | Switzerland | Non-EU associate |
23 | Ireland | EU |
24 | Portugal | EU |
25 | Romania | EU |
26 | Hungary | EU |
27 | Cyprus | EU |
28 | Croatia | EU |
29 | Lithuania | EU |
30 | Latvia | EU |
# len(wos),len(wos_univ_locations)
# wos_addresses = pd.read_excel(f"/{outdir}/wos_addresses.xlsx")
# wos_affiliations = pd.read_excel(f"/{outdir}/wos_affiliations.xlsx")
# wos_author_locations = pd.read_excel(f"/{outdir}/wos_author_locations.xlsx")
# wos_univ_locations = pd.read_excel(f"/{outdir}/wos_univ_locations.xlsx")
record_col = "UT (Unique WOS ID)"
# def nth_repl_all(s, sub="", repl="<br>", nth=2):
# find = s.find(sub)
# # loop until we find no match
# i = 1
# while find != -1:
# # if i is equal to nth we found nth matches so replace
# if i == nth:
# s = s[:find]+repl+s[find + len(sub):]
# i = 0
# # find + len(sub) + 1 means we start after the last match
# find = s.find(sub, find + len(sub) + 1)
# i += 1
# return s.replace("<br>&","&<br")
def replace_nth(s, sub=" ", repl="<br>", n=2):
    """Replace every n-th occurrence of `sub` in `s` with `repl`;
    used to wrap long category labels for the plotly charts below."""
    chunks = s.split(sub)
    size = len(chunks)
    # number of groups of n chunks; the last group may be shorter
    rows = size // n + (0 if size % n == 0 else 1)
    # re-join chunks within each group with `sub` and the groups with `repl`,
    # then move a dangling "&" back onto the end of the preceding line
    return (repl.join([
        sub.join([chunks[i * n + j] for j in range(n if (i + 1) * n < size else size - i * n)])
        for i in range(rows)
    ])).replace("<br>&", " &<br>")
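# Quick illustration (output follows from the definition above): every 2nd space
# becomes a line break, with a trailing "&" kept on the first line
replace_nth("Information & Communication Technologies")
# -> 'Information &<br>Communication Technologies'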
groups = ['Domain_English',"Field_English",'SubField_English']
data = wos.groupby(groups, as_index=False)[record_col].nunique().sort_values(ascending=False, by=record_col)
data["percent"] = data[record_col]/data[record_col].sum()
data[groups] = data[groups].applymap(replace_nth)
data
 | Domain_English | Field_English | SubField_English | UT (Unique WOS ID) | percent |
---|---|---|---|---|---|
37 | Applied Sciences | Information &<br>Communication Technologies | Artificial Intelligence &<br> Image<br>Processing | 7915 | 0.171841 |
44 | Applied Sciences | Information &<br>Communication Technologies | Networking &<br>Telecommunications | 5360 | 0.116370 |
32 | Applied Sciences | Engineering | Geological &<br>Geomatics Engineering | 2576 | 0.055927 |
33 | Applied Sciences | Engineering | Industrial Engineering &<br> Automation | 2316 | 0.050282 |
15 | Applied Sciences | Enabling &<br>Strategic Technologies | Energy | 1965 | 0.042662 |
... | ... | ... | ... | ... | ... |
11 | Applied Sciences | Economics &<br>Business | Business &<br>Management | 1 | 0.000022 |
46 | Applied Sciences | Social Sciences | Anthropology | 1 | 0.000022 |
54 | Arts &<br>Humanities | Philosophy &<br>Theology | Philosophy | 1 | 0.000022 |
52 | Arts &<br>Humanities | Historical Studies | History of<br>Social Sciences | 1 | 0.000022 |
129 | Health Sciences | Psychology &<br>Cognitive Sciences | General Psychology &<br> Cognitive<br>Sciences | 1 | 0.000022 |
175 rows × 5 columns
fig = px.sunburst(data, path=groups, values=record_col,
color='Domain_English',title="Distribution of topics<br>(METRIX classification)", template='plotly')
fig.update_traces(hovertemplate='%{id}<br>%{value:d}')
fig.show(config= dict(displayModeBar = False))
group = 'Domain_English'
data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=record_col)
data
 | Domain_English | UT (Unique WOS ID) |
---|---|---|
0 | Applied Sciences | 29985 |
5 | Natural Sciences | 8457 |
3 | Health Sciences | 5341 |
2 | Economic & Social Sciences | 1360 |
4 | Multidisciplinary | 847 |
1 | Arts & Humanities | 70 |
g = sns.barplot(data, x=record_col, y=group)
g.set_xlim(0,35000)
g.set_ylabel(None)
g.set_xlabel("Number of co-publications")
g.set_title("Distribution of Domains")
for i in g.containers:
    g.bar_label(i, fontsize=10)
fig = px.bar(data, x=record_col, y=group, color=group,
labels={
record_col: 'Number of co-publications',
group: "",
},
title="Distribution of Domains", template='plotly')
fig.update_layout(showlegend=False, xaxis_tickformat='d',font_family="Montserrat")
fig.update_traces(hovertemplate='%{x:d}')
fig.add_shape(
# Rectangle with reference to the plot
type="rect",
xref="paper",
yref="paper",
x0=0,
y0=0,
x1=1.0,
y1=1.0,
line=dict(
color="black",
width=0.5,
)
)
fig.update_yaxes(
showgrid=True,
ticks="outside")
fig.update_xaxes(
showgrid=True,
ticks="outside")
fig.show(config= dict(displayModeBar = False))
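# The frame-plus-outside-ticks styling above recurs for every plotly figure below.
# A small helper could wrap it (hypothetical refactoring; the cells that follow
# keep the original inline version):
def style_plotly(fig):
    # thin black frame around the plotting area
    fig.add_shape(type="rect", xref="paper", yref="paper",
                  x0=0, y0=0, x1=1.0, y1=1.0,
                  line=dict(color="black", width=0.5))
    # grid lines and outside ticks on both axes
    fig.update_yaxes(showgrid=True, ticks="outside")
    fig.update_xaxes(showgrid=True, ticks="outside")
    return fig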
# # define a function to divide each row's 'Count' by the value of the first year
# def divide_by_first_year(group):
# group['relative_growth'] = group[record_col] / group.loc[group['Publication Year'] == group['Publication Year'].min(), record_col].values[0]
# return group
#
#
#
# data = (wos.groupby(group)[record_col].nunique()
# .unstack(fill_value=0).stack()
# .reset_index()
# .rename(columns={0:record_col})
# .sort_values(ascending=False, by=group+[record_col]))
#
# # group by 'Topic'
# grouped = data.groupby('Domain_English')
# # apply the function to each group
# data = grouped.apply(divide_by_first_year).reset_index(drop=True)
# data['relative_growth'] = data['relative_growth']*100
group = ['Publication Year','Domain_English']
# complete the year × domain grid: unstack/stack with fillna(0) materialises
# missing (year, domain) combinations as explicit zero counts
data = (wos.groupby(['Publication Year','Domain_English'])[record_col].nunique(dropna=False).unstack()
        .fillna(0)
        .stack()
        .reset_index()
        .rename(columns={0:record_col}))
# baseline per domain: the count in its first year with non-zero output
data = data.merge(data[data[record_col]>0].sort_values(by=["Publication Year"], ascending=True).drop_duplicates(subset='Domain_English'),
                  on='Domain_English', suffixes=[None,"_relative_growth"])
# growth relative to that baseline, in percent
data[record_col+"_relative_growth"] = (data[record_col]-data[record_col+"_relative_growth"])/data[record_col+"_relative_growth"]*100
data
 | Publication Year | Domain_English | UT (Unique WOS ID) | Publication Year_relative_growth | UT (Unique WOS ID)_relative_growth |
---|---|---|---|---|---|
0 | 2011 | Applied Sciences | 490.0 | 2011 | 0.000000 |
1 | 2012 | Applied Sciences | 593.0 | 2011 | 21.020408 |
2 | 2013 | Applied Sciences | 738.0 | 2011 | 50.612245 |
3 | 2014 | Applied Sciences | 1031.0 | 2011 | 110.408163 |
4 | 2015 | Applied Sciences | 1201.0 | 2011 | 145.102041 |
... | ... | ... | ... | ... | ... |
67 | 2018 | Natural Sciences | 753.0 | 2011 | 316.022099 |
68 | 2019 | Natural Sciences | 999.0 | 2011 | 451.933702 |
69 | 2020 | Natural Sciences | 1232.0 | 2011 | 580.662983 |
70 | 2021 | Natural Sciences | 1403.0 | 2011 | 675.138122 |
71 | 2022 | Natural Sciences | 1665.0 | 2011 | 819.889503 |
72 rows × 5 columns
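# Spot-check of the relative-growth formula against the last row above
# (Natural Sciences: 181 co-publications in the 2011 baseline, 1665 in 2022):
(1665 - 181) / 181 * 100  # -> 819.889...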
g=sns.lineplot(data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1], marker="o")
g.set(xticks=list(range(2012,2022+1,2)))
g.legend(title=None)
g.set_xlabel(None)
g.set_ylabel(None)
g.set_title("Yearly output of co-publications")
fig = px.line(data.sort_values(ascending=[True,True], by=[group[0],group[-1]]),y=record_col,x=group[0], color=group[-1], markers=True, labels={
record_col: 'Number of co-publications',
group[-1]: "Domain",
},
title="Yearly output of co-publications", template='plotly')
fig.update_traces(hovertemplate='%{y:d}')
fig.update_layout(hovermode='x unified')
fig.add_shape(
# Rectangle with reference to the plot
type="rect",
xref="paper",
yref="paper",
x0=0,
y0=0,
x1=1.0,
y1=1.0,
line=dict(
color="black",
width=0.5,
)
)
fig.update_yaxes(
showgrid=True,
ticks="outside")
fig.update_xaxes(
showgrid=True,
ticks="outside")
fig.show(config= dict(displayModeBar = False))
fig = px.line(data.sort_values(ascending=[True,True], by=[group[0],group[-1]]),y=record_col+"_relative_growth",x=group[0], color=group[-1], markers=True, labels={
record_col+"_relative_growth": 'Rel. growth<br>in co-publications (%)',
group[-1]: "Domain",
},
title="Relative growth in the output of co-publications", template='plotly')
fig.update_traces(hovertemplate='%{y:.2f}%')
fig.update_layout(hovermode='x unified',yaxis_tickformat='d',font_family="Montserrat")
fig.add_shape(
# Rectangle with reference to the plot
type="rect",
xref="paper",
yref="paper",
x0=0,
y0=0,
x1=1.0,
y1=1.0,
line=dict(
color="black",
width=0.5,
)
)
fig.update_yaxes(
showgrid=True,
ticks="outside")
fig.update_xaxes(
showgrid=True,
ticks="outside")
fig.show(config= dict(displayModeBar = False))
pivot_data = pd.pivot_table(data, values=record_col, index=['Domain_English'],
columns=['Publication Year'], fill_value=0)
pivot_data
Domain_English \ Publication Year | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Applied Sciences | 490 | 593 | 738 | 1031 | 1201 | 1535 | 1920 | 2808 | 3729 | 4446 | 5295 | 6199 |
Arts & Humanities | 0 | 0 | 0 | 4 | 1 | 3 | 7 | 4 | 11 | 11 | 16 | 13 |
Economic & Social Sciences | 20 | 22 | 29 | 28 | 34 | 40 | 84 | 105 | 160 | 211 | 252 | 375 |
Health Sciences | 116 | 120 | 155 | 184 | 216 | 243 | 321 | 403 | 611 | 755 | 1035 | 1182 |
Multidisciplinary | 15 | 21 | 43 | 52 | 57 | 64 | 75 | 76 | 83 | 97 | 115 | 149 |
Natural Sciences | 181 | 223 | 298 | 318 | 380 | 437 | 568 | 753 | 999 | 1232 | 1403 | 1665 |
f, ax = plt.subplots(figsize=(9, 6))
g = sns.heatmap(pivot_data, annot=True, fmt="d", linewidths=.5, ax=ax)
g.set(xlabel="", ylabel="")
import numpy as np
# share of each domain within each year's output, in percent
percent_pivot = pd.crosstab(data['Domain_English'], data['Publication Year'], values=data[record_col], aggfunc='sum', normalize='columns')*100
percent_pivot
Domain_English \ Publication Year | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Applied Sciences | 59.610706 | 60.572012 | 58.432304 | 63.760049 | 63.578613 | 66.106804 | 64.537815 | 67.678959 | 66.672626 | 65.847156 | 65.241498 | 64.687467 |
Arts & Humanities | 0.000000 | 0.000000 | 0.000000 | 0.247372 | 0.052938 | 0.129199 | 0.235294 | 0.096409 | 0.196674 | 0.162915 | 0.197141 | 0.135657 |
Economic & Social Sciences | 2.433090 | 2.247191 | 2.296120 | 1.731602 | 1.799894 | 1.722653 | 2.823529 | 2.530730 | 2.860719 | 3.125000 | 3.104978 | 3.913180 |
Health Sciences | 14.111922 | 12.257406 | 12.272367 | 11.379097 | 11.434621 | 10.465116 | 10.789916 | 9.713184 | 10.924370 | 11.181872 | 12.752587 | 12.334342 |
Multidisciplinary | 1.824818 | 2.145046 | 3.404592 | 3.215832 | 3.017470 | 2.756245 | 2.521008 | 1.831767 | 1.483998 | 1.436611 | 1.416954 | 1.554837 |
Natural Sciences | 22.019465 | 22.778345 | 23.594616 | 19.666048 | 20.116464 | 18.819983 | 19.092437 | 18.148952 | 17.861613 | 18.246445 | 17.286841 | 17.374517 |
f, ax = plt.subplots(figsize=(15, 6))
g = sns.heatmap(percent_pivot, annot=True, fmt='.2f', linewidths=.5, ax=ax, cbar=False)
for t in ax.texts: t.set_text(t.get_text() + " %")
g.set(xlabel="", ylabel="")
percent_pivot.T.plot(kind='bar',
stacked=True,
figsize=(10, 6))
percent_pivot.T.plot(kind='bar',
stacked=True,
figsize=(15, 8))
plt.legend(loc="lower left", ncol=2)
# plt.xlabel("Publication Year")
# plt.ylabel("Proportion (%)")
# label each stacked segment with its absolute count, placed at the segment's
# vertical midpoint (cumsum marks the segment top; subtract the segment height,
# then add half of it back)
for n, x in enumerate([*pivot_data.T.index.values]):
    for (proportion, count, y_loc) in zip(percent_pivot.T.loc[x],
                                          pivot_data.T.loc[x],
                                          percent_pivot.T.loc[x].cumsum()):
        plt.text(y=(y_loc - proportion) + (proportion / 2),
                 x=n - 0.11,
                 s=f'{count}',  # ({np.round(proportion, 1)}%)',
                 color="black",
                 fontsize=8,
                 fontweight="bold")
plt.show()
group = ['Publication Year',"Domain_English",'Field_English']
data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])
data
 | Publication Year | Domain_English | Field_English | UT (Unique WOS ID) |
---|---|---|---|---|
233 | 2022 | Natural Sciences | Physics & Astronomy | 596 |
232 | 2022 | Natural Sciences | Mathematics & Statistics | 228 |
231 | 2022 | Natural Sciences | Earth & Environmental Sciences | 409 |
230 | 2022 | Natural Sciences | Chemistry | 251 |
229 | 2022 | Natural Sciences | Biology | 181 |
... | ... | ... | ... | ... |
4 | 2011 | Applied Sciences | Information & Communication Technologies | 256 |
3 | 2011 | Applied Sciences | Engineering | 166 |
2 | 2011 | Applied Sciences | Enabling & Strategic Technologies | 53 |
1 | 2011 | Applied Sciences | Built Environment & Design | 6 |
0 | 2011 | Applied Sciences | Agriculture, Fisheries & Forestry | 9 |
234 rows × 4 columns
len(data[group[-2]].unique())
6
data_complete = pd.DataFrame()
for cat in sorted(data[group[-2]].unique()):
    # data segment
    sub_data = data[data[group[-2]]==cat]
    # pyjanitor's complete(): expand to every year × field combination, zero-filling gaps
    sub_data = sub_data.complete({group[0]: range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)},
                                 group[-1], fill_value=0)
    data_complete = pd.concat([data_complete, sub_data], ignore_index=True)
    # plot
    g = sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),
                     y=record_col, x=group[0], hue=group[-1], marker="o")
    g.set(xticks=list(range(2012,2022+1,2)))
    g.legend(title=None)
    g.set_title(cat)
    g.yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.show()
data_complete = pd.DataFrame()
# Creating subplot axes
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
for cat, ax in zip(sorted(data[group[-2]].unique()), axes.flatten()):
    # data segment
    sub_data = data[data[group[-2]]==cat]
    sub_data = sub_data.complete({group[0]: range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)},
                                 group[-1], fill_value=0)
    data_complete = pd.concat([data_complete, sub_data], ignore_index=True)
    # plot
    g = sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),
                     y=record_col, x=group[0], hue=group[-1], marker="o", ax=ax)
    g.set(xticks=list(range(2012,2022+1,2)))
    g.legend(title=None)
    g.set_title(cat)
    g.set_xlabel(None)
    g.set_ylabel(None)
    g.yaxis.set_major_locator(MaxNLocator(integer=True))
fig.suptitle("Number of co-publications in domains and respective fields", y=0.92)
plt.show()
group = ['Publication Year',"Domain_English",'Field_English',"SubField_English"]
data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])
data
 | Publication Year | Domain_English | Field_English | SubField_English | UT (Unique WOS ID) |
---|---|---|---|---|---|
1598 | 2022 | Natural Sciences | Physics & Astronomy | Optics | 134 |
1597 | 2022 | Natural Sciences | Physics & Astronomy | Nuclear & Particle Physics | 65 |
1596 | 2022 | Natural Sciences | Physics & Astronomy | Mathematical Physics | 10 |
1595 | 2022 | Natural Sciences | Physics & Astronomy | General Physics | 31 |
1594 | 2022 | Natural Sciences | Physics & Astronomy | Fluids & Plasmas | 79 |
... | ... | ... | ... | ... | ... |
4 | 2011 | Applied Sciences | Agriculture, Fisheries & Forestry | Forestry | 1 |
3 | 2011 | Applied Sciences | Agriculture, Fisheries & Forestry | Food Science | 1 |
2 | 2011 | Applied Sciences | Agriculture, Fisheries & Forestry | Fisheries | 2 |
1 | 2011 | Applied Sciences | Agriculture, Fisheries & Forestry | Dairy & Animal Science | 2 |
0 | 2011 | Applied Sciences | Agriculture, Fisheries & Forestry | Agronomy & Agriculture | 3 |
1599 rows × 5 columns
for cat in sorted(data[group[-2]].unique()):
    sub_data = data[data[group[-2]]==cat]
    sub_data = sub_data.complete({group[0]: range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)},
                                 group[-1], fill_value=0)
    g = sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]), y=record_col, x=group[0],
                     hue=group[-1], marker="o", errorbar=None)
    g.set(xticks=list(range(2012,2022+1,2)))
    g.legend(title=None, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., ncols=math.ceil(len(g.legend_.texts)/12))
    g.set_title(f'Number of co-publications in {cat}')
    g.set_ylabel(None)
    plt.show()
len(sorted(data[group[-2]].unique()))
20
from matplotlib.ticker import FuncFormatter
import math
def orderOfMagnitude(number):
    """Order of magnitude of a positive number, e.g. 4245 -> 3."""
    return math.floor(math.log(number, 10))

def roundToNearest(number):
    """Round a positive number up to the next multiple of its order of
    magnitude (e.g. 4245 -> 5000); used below to pick clean axis limits."""
    order = orderOfMagnitude(number)
    # if order!=0:
    #     order+=1
    near = math.ceil(number/10**order)*10**order
    return near
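# Illustration, mirroring how the function is used for the axis limits below:
roundToNearest(4245)  # -> 5000 (order of magnitude 3, ceil to the next 1000)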
wos_univ_locations = wos_univ.merge(wos_country_types, on="Country")
wos_univ_locations.sample(100)
 | UT (Unique WOS ID) | Institution | Country | Institution_harm | merge_iter | Country_Type |
---|---|---|---|---|---|---|
41191 | WOS:000538161600016 | Anhui Univ | China | Anhui Univ | 0 | China |
175692 | WOS:000709411500003 | Univ Porto | Portugal | Univ Porto | 0 | EU |
75198 | WOS:000831217100027 | Zhejiang Univ | China | Zhejiang Univ | 0 | China |
48614 | WOS:000597938400003 | Shanghai Jiao Tong Univ | China | Shanghai Jiao Tong Univ | 0 | China |
133670 | WOS:000411824101159 | Univ Pisa | Italy | Univ Pisa | 0 | EU |
... | ... | ... | ... | ... | ... | ... |
2892 | WOS:000293708200019 | Natl Univ Def Technol | China | Natl Univ Def Technol | 0 | China |
125259 | WOS:000663324800010 | INRAE | France | INRAE | 0 | EU |
55780 | WOS:000659952900011 | Huazhong Univ Sci & Technol | China | Huazhong Univ Sci & Technol | 0 | China |
138600 | WOS:000744399000001 | Brignone Clin | Italy | Brignone Clin | 0 | EU |
31040 | WOS:000471758500010 | Chinese Acad Sci | China | Chinese Acad Sci | 0 | China |
100 rows × 6 columns
wos_collabs = wos_univ_locations[wos_univ_locations["Country_Type"]!="Other"][[record_col,"Country"]].drop_duplicates()
collab_desc = wos_collabs[wos_collabs["Country"]!="China"]["Country"].value_counts().reset_index()
# share of all co-publications that involve the country at least once...
collab_desc["percent_of_copubs"] = collab_desc["count"]/wos_collabs[record_col].nunique()*100
# ...versus the country's share of all (record, country) participations
collab_desc["percent_contrib_in_copubs"] = collab_desc["count"]/wos_collabs[record_col].size*100
collab_desc = collab_desc.merge(wos_country_types, on="Country")
collab_desc
c_dict = {"count":"Number of co-publications",
"percent_of_copubs":"Percent of co-publications",
"percent_contrib_in_copubs":"Contribution to co-publications"}
# Creating subplot axes
# fig, axes = plt.subplots(ncols=3,figsize=(15, 15))
# for c,ax in zip(c_dict.keys(),axes.flatten()):
for c in c_dict.keys():
    data = collab_desc[["Country",c,"Country_Type"]]
    plt.figure(figsize=(9,12))
    g = sns.barplot(data, x=c, y="Country", hue="Country_Type", dodge=False)
    g.set_xlim(0, roundToNearest(data[c].max()))
    g.set_ylabel(None)
    g.set_xlabel(c_dict.get(c))
    g.set_title(c_dict.get(c))
    g.legend(title=None, loc="right")
    for i in g.containers:
        g.bar_label(i, fontsize=10, fmt='%.1f%%' if 'percent' in c else '%.0f')
    if 'percent' in c:
        g.xaxis.set_major_locator(MaxNLocator(integer=True))
        vals = g.get_xticks()
        g.set_xticks(vals)  # pin the ticks first so set_xticklabels does not warn
        g.set_xticklabels([str(int(val))+'%' for val in vals])
    plt.show()
# EU and associate countries only (China and "Other" dropped), one row per (record, country)
wos_collabs_EU = wos_univ_locations[~wos_univ_locations["Country_Type"].isin(["Other","China"])][[record_col,"Country"]].drop_duplicates()
# self-merge on the record ID to enumerate the country pairs within each co-publication
wos_collabs_EU = wos_collabs_EU.merge(wos_collabs_EU, on=record_col)
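# Minimal illustration of the self-merge pairing (toy data, not from the corpus):
toy = pd.DataFrame({record_col: ["W1", "W1"], "Country": ["France", "Italy"]})
toy.merge(toy, on=record_col)
# -> four rows: (France, France), (France, Italy), (Italy, France), (Italy, Italy);
#    the upper-triangle mask below keeps each unordered pair only once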
EU_co_occur = pd.crosstab(wos_collabs_EU['Country_x'], wos_collabs_EU['Country_y'], values=wos_collabs_EU[record_col], aggfunc='nunique', normalize='all').fillna(0)
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(EU_co_occur, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Draw the heatmap with the mask and correct aspect ratio
g = sns.heatmap(EU_co_occur, mask=mask,
square=True, linewidths=.5)
g.set_ylabel(None)
g.set_xlabel(None)
wos_collabs_EU = wos_univ_locations[~wos_univ_locations["Country_Type"].isin(["Other","China"])][[record_col,"Country"]].drop_duplicates()
wos_collabs_EU = wos_collabs_EU.merge(wos_collabs_EU, on=record_col)
wos_collabs_EU
EU_co_occur = pd.crosstab(wos_collabs_EU['Country_x'], wos_collabs_EU['Country_y'], values=wos_collabs_EU[record_col], aggfunc='nunique').fillna(0).astype(int)
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(EU_co_occur, dtype=bool))
data = np.where(mask, None, EU_co_occur)  # None blanks the upper triangle in px.imshow
EU_co_occur.columns
Index(['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'United Kingdom'], dtype='object', name='Country_y')
fig = px.imshow(data,
labels=dict(x="Country (x)", y="Country (y)", color="Co-publication"),
x=list(EU_co_occur.columns),
y=list(EU_co_occur.index), title="Intra-European patterns"
)
fig.update_layout(title_x=0.5,
width=1000, height=1000,
xaxis_showgrid=False,
yaxis_showgrid=False,
yaxis_autorange='reversed', template='plotly_white')
fig.update_xaxes(tickangle= -90)
fig.update_yaxes(
ticks="outside")
fig.update_xaxes(
ticks="outside")
fig.show(config= dict(displayModeBar = False))
collab_year = wos_collabs[wos_collabs["Country"]!="China"].copy()
collab_year = collab_year.merge(wos_country_types, on="Country").merge(wos[[record_col,"Publication Year"]],on=record_col).drop_duplicates()
data = collab_year.groupby(["Publication Year",'Country_Type'],as_index=False)[record_col].nunique()
g=sns.lineplot(data,y=record_col,x="Publication Year", hue="Country_Type", marker="o")
g.set(xticks=list(range(2012,2022+1,2)))
g.legend(title=None)
g.set_xlabel(None)
g.set_ylabel(None)
g.set_title("Yearly output of co-publications with China")
import country_converter as coco
cc = coco.CountryConverter()
# same pattern as for domains above: complete the year × country grid,
# then compute growth relative to each country's first non-zero year
data = (collab_year.groupby(['Publication Year',"Country"])[record_col]
        .nunique(dropna=False).unstack()
        .fillna(0)
        .stack()
        .reset_index()
        .rename(columns={0:record_col}))
data = data.merge(data[data[record_col]>0].sort_values(by=["Publication Year"], ascending=True).drop_duplicates(subset="Country"),
on=["Country"], suffixes=[None,"_relative_growth"])
data[record_col+"_relative_growth"] = (data[record_col]-data[record_col+"_relative_growth"])/data[record_col+"_relative_growth"]*100
data
 | Publication Year | Country | UT (Unique WOS ID) | Publication Year_relative_growth | UT (Unique WOS ID)_relative_growth |
---|---|---|---|---|---|
0 | 2011 | Austria | 22.0 | 2011 | 0.000000 |
1 | 2012 | Austria | 24.0 | 2011 | 9.090909 |
2 | 2013 | Austria | 26.0 | 2011 | 18.181818 |
3 | 2014 | Austria | 39.0 | 2011 | 77.272727 |
4 | 2015 | Austria | 50.0 | 2011 | 127.272727 |
... | ... | ... | ... | ... | ... |
355 | 2018 | United Kingdom | 1837.0 | 2011 | 406.060606 |
356 | 2019 | United Kingdom | 2430.0 | 2011 | 569.421488 |
357 | 2020 | United Kingdom | 3108.0 | 2011 | 756.198347 |
358 | 2021 | United Kingdom | 3718.0 | 2011 | 924.242424 |
359 | 2022 | United Kingdom | 4245.0 | 2011 | 1069.421488 |
360 rows × 5 columns
data["ISO3"] = cc.pandas_convert(series=data["Country"], to='ISO3')
fig = px.choropleth(data, locations="ISO3", color=record_col, hover_name="Country",
animation_frame='Publication Year', scope="europe", template='plotly', range_color=[data[record_col].min(),data[record_col].max()])
fig.show()
data["ISO3"] = cc.pandas_convert(series=data["Country"], to='ISO3')
fig = px.choropleth(data, locations="ISO3", color=record_col+"_relative_growth", hover_name="Country",
animation_frame='Publication Year', scope="europe", template='plotly',
range_color=[data[record_col+"_relative_growth"].min(),data[record_col+"_relative_growth"].max()])
fig.show()
[data[record_col+"_relative_growth"].min(),data[record_col+"_relative_growth"].max()]
[-100.0, 3700.0]
fig = px.line(data.sort_values(ascending=True, by='Publication Year'),y=record_col,x='Publication Year', color="Country", markers=True,
labels={
record_col: 'Number of co-publications',
},
title="Yearly output of co-publications", template='plotly')
fig.update_traces(hovertemplate='%{y:d}')
fig.update_layout(hovermode='x unified')
fig.add_shape(
# Rectangle with reference to the plot
type="rect",
xref="paper",
yref="paper",
x0=0,
y0=0,
x1=1.0,
y1=1.0,
line=dict(
color="black",
width=0.5,
)
)
fig.update_yaxes(
showgrid=True,
ticks="outside")
fig.update_xaxes(
showgrid=True,
ticks="outside")
fig.show(config= dict(displayModeBar = False))
fig = px.line(data.sort_values(ascending=True, by='Publication Year'),y=record_col+"_relative_growth",x='Publication Year', color="Country", markers=True,
labels={
record_col+"_relative_growth": 'Relative growth of co-publications (%)',
},
title="Relative growth of co-publications<br>(baseline: 2011)", template='plotly')
fig.update_traces(hovertemplate='%{y:d}%')
fig.add_shape(
# Rectangle with reference to the plot
type="rect",
xref="paper",
yref="paper",
x0=0,
y0=0,
x1=1.0,
y1=1.0,
line=dict(
color="black",
width=0.5,
)
)
fig.update_yaxes(
showgrid=True,
ticks="outside")
fig.update_xaxes(
showgrid=True,
ticks="outside")
fig.show(config= dict(displayModeBar = False))
year_pivot = pd.crosstab(collab_year['Country'], collab_year['Publication Year'], values=collab_year[record_col], aggfunc='nunique').fillna(0).astype(int)
year_pivot
Country \ Publication Year | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Austria | 22 | 24 | 26 | 39 | 50 | 57 | 72 | 89 | 138 | 137 | 185 | 205 |
Belgium | 34 | 38 | 40 | 65 | 71 | 81 | 90 | 133 | 179 | 213 | 242 | 292 |
Bulgaria | 4 | 5 | 8 | 9 | 7 | 19 | 21 | 18 | 10 | 25 | 32 | 19 |
Croatia | 1 | 2 | 6 | 8 | 10 | 7 | 10 | 19 | 27 | 29 | 33 | 35 |
Cyprus | 2 | 1 | 5 | 5 | 5 | 5 | 8 | 7 | 15 | 28 | 36 | 43 |
Czech Republic | 13 | 15 | 16 | 21 | 20 | 36 | 37 | 56 | 64 | 81 | 93 | 123 |
Denmark | 35 | 33 | 40 | 59 | 68 | 74 | 101 | 195 | 234 | 245 | 293 | 343 |
Estonia | 3 | 3 | 7 | 10 | 12 | 10 | 15 | 15 | 16 | 38 | 45 | 39 |
Finland | 31 | 35 | 44 | 82 | 100 | 125 | 126 | 198 | 241 | 256 | 289 | 380 |
France | 117 | 130 | 174 | 231 | 269 | 325 | 348 | 491 | 648 | 691 | 807 | 858 |
Germany | 123 | 172 | 192 | 273 | 310 | 365 | 456 | 604 | 801 | 907 | 1210 | 1386 |
Greece | 15 | 18 | 19 | 32 | 35 | 50 | 47 | 81 | 114 | 122 | 139 | 181 |
Hungary | 11 | 11 | 21 | 16 | 20 | 38 | 34 | 47 | 61 | 61 | 83 | 90 |
Ireland | 13 | 16 | 22 | 31 | 27 | 45 | 66 | 72 | 84 | 116 | 167 | 187 |
Italy | 51 | 70 | 84 | 116 | 178 | 187 | 247 | 325 | 441 | 571 | 641 | 811 |
Latvia | 0 | 0 | 1 | 0 | 1 | 8 | 10 | 15 | 10 | 9 | 13 | 18 |
Lithuania | 1 | 2 | 10 | 4 | 4 | 13 | 12 | 23 | 38 | 36 | 38 | 38 |
Luxembourg | 2 | 3 | 3 | 1 | 8 | 9 | 13 | 15 | 18 | 22 | 35 | 51 |
Malta | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 6 | 2 | 7 | 10 |
Netherlands | 72 | 64 | 77 | 103 | 139 | 166 | 220 | 297 | 408 | 470 | 529 | 655 |
Norway | 30 | 42 | 60 | 76 | 67 | 88 | 104 | 134 | 222 | 253 | 304 | 311 |
Poland | 17 | 31 | 37 | 57 | 73 | 82 | 98 | 110 | 138 | 181 | 276 | 353 |
Portugal | 16 | 23 | 35 | 41 | 45 | 58 | 79 | 119 | 136 | 147 | 204 | 212 |
Romania | 7 | 15 | 13 | 16 | 25 | 26 | 37 | 57 | 64 | 55 | 48 | 62 |
Slovakia | 9 | 6 | 6 | 10 | 12 | 22 | 18 | 27 | 27 | 34 | 36 | 45 |
Slovenia | 7 | 7 | 10 | 12 | 17 | 27 | 22 | 47 | 54 | 31 | 48 | 40 |
Spain | 50 | 49 | 69 | 112 | 138 | 185 | 232 | 273 | 356 | 386 | 473 | 640 |
Sweden | 34 | 50 | 59 | 83 | 113 | 170 | 233 | 232 | 385 | 359 | 428 | 510 |
Switzerland | 37 | 50 | 54 | 74 | 74 | 95 | 155 | 195 | 233 | 263 | 349 | 447 |
United Kingdom | 363 | 417 | 531 | 660 | 781 | 979 | 1350 | 1837 | 2430 | 3108 | 3718 | 4245 |
f, ax = plt.subplots(figsize=(15, 15))
g = sns.heatmap(year_pivot, annot=True, fmt="d", linewidths=.5, ax=ax)
g.set(xlabel="", ylabel="")
for i in range(year_pivot.shape[0]+1):
    ax.axhline(i, color='white', lw=10)
year_percent_pivot = pd.crosstab(collab_year['Country'], collab_year['Publication Year'], values=collab_year[record_col], aggfunc='nunique', normalize='columns').fillna(0)*100
year_percent_pivot
Country \ Publication Year | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Austria | 1.962533 | 1.801802 | 1.557819 | 1.736420 | 1.865672 | 1.699970 | 1.689744 | 1.552958 | 1.816267 | 1.543488 | 1.712804 | 1.623248 |
Belgium | 3.033006 | 2.852853 | 2.396645 | 2.894034 | 2.649254 | 2.415747 | 2.112180 | 2.320712 | 2.355883 | 2.399730 | 2.240533 | 2.312139 |
Bulgaria | 0.356824 | 0.375375 | 0.479329 | 0.400712 | 0.261194 | 0.566657 | 0.492842 | 0.314081 | 0.131614 | 0.281658 | 0.296269 | 0.150447 |
Croatia | 0.089206 | 0.150150 | 0.359497 | 0.356189 | 0.373134 | 0.208768 | 0.234687 | 0.331530 | 0.355357 | 0.326724 | 0.305527 | 0.277140 |
Cyprus | 0.178412 | 0.075075 | 0.299581 | 0.222618 | 0.186567 | 0.149120 | 0.187749 | 0.122143 | 0.197420 | 0.315457 | 0.333302 | 0.340486 |
Czech Republic | 1.159679 | 1.126126 | 0.958658 | 0.934996 | 0.746269 | 1.073665 | 0.868341 | 0.977142 | 0.842327 | 0.912573 | 0.861031 | 0.973949 |
Denmark | 3.122212 | 2.477477 | 2.396645 | 2.626892 | 2.537313 | 2.206979 | 2.370336 | 3.402548 | 3.079758 | 2.760252 | 2.712712 | 2.715971 |
Estonia | 0.267618 | 0.225225 | 0.419413 | 0.445236 | 0.447761 | 0.298240 | 0.352030 | 0.261734 | 0.210582 | 0.428121 | 0.416628 | 0.308813 |
Finland | 2.765388 | 2.627628 | 2.636309 | 3.650935 | 3.731343 | 3.728005 | 2.957052 | 3.454894 | 3.171887 | 2.884182 | 2.675678 | 3.008948 |
France | 10.437110 | 9.759760 | 10.425404 | 10.284951 | 10.037313 | 9.692812 | 8.167097 | 8.567440 | 8.528560 | 7.785038 | 7.471530 | 6.793887 |
Germany | 10.972346 | 12.912913 | 11.503895 | 12.154942 | 11.567164 | 10.885774 | 10.701713 | 10.539173 | 10.542248 | 10.218567 | 11.202666 | 10.974741 |
Greece | 1.338091 | 1.351351 | 1.138406 | 1.424755 | 1.305970 | 1.491202 | 1.103027 | 1.413366 | 1.500395 | 1.374493 | 1.286918 | 1.433209 |
Hungary | 0.981267 | 0.825826 | 1.258238 | 0.712378 | 0.746269 | 1.133313 | 0.797935 | 0.820101 | 0.802843 | 0.687247 | 0.768447 | 0.712645 |
Ireland | 1.159679 | 1.201201 | 1.318155 | 1.380232 | 1.007463 | 1.342082 | 1.548932 | 1.256325 | 1.105554 | 1.306895 | 1.546153 | 1.480719 |
Italy | 4.549509 | 5.255255 | 5.032954 | 5.164737 | 6.641791 | 5.577095 | 5.796761 | 5.670913 | 5.804159 | 6.433078 | 5.934636 | 6.421728 |
Latvia | 0.000000 | 0.000000 | 0.059916 | 0.000000 | 0.037313 | 0.238592 | 0.234687 | 0.261734 | 0.131614 | 0.101397 | 0.120359 | 0.142529 |
Lithuania | 0.089206 | 0.150150 | 0.599161 | 0.178094 | 0.149254 | 0.387712 | 0.281624 | 0.401326 | 0.500132 | 0.405588 | 0.351819 | 0.300895 |
Luxembourg | 0.178412 | 0.225225 | 0.179748 | 0.044524 | 0.298507 | 0.268416 | 0.305093 | 0.261734 | 0.236904 | 0.247859 | 0.324044 | 0.403832 |
Malta | 0.089206 | 0.000000 | 0.000000 | 0.000000 | 0.037313 | 0.029824 | 0.000000 | 0.000000 | 0.078968 | 0.022533 | 0.064809 | 0.079183 |
Netherlands | 6.422837 | 4.804805 | 4.613541 | 4.585931 | 5.186567 | 4.950790 | 5.163107 | 5.182342 | 5.369834 | 5.295178 | 4.897695 | 5.186476 |
Norway | 2.676182 | 3.153153 | 3.594967 | 3.383793 | 2.500000 | 2.624515 | 2.440742 | 2.338161 | 2.921822 | 2.850383 | 2.814554 | 2.462586 |
Poland | 1.516503 | 2.327327 | 2.216896 | 2.537845 | 2.723881 | 2.445571 | 2.299930 | 1.919386 | 1.816267 | 2.039207 | 2.555319 | 2.795154 |
Portugal | 1.427297 | 1.726727 | 2.097064 | 1.825467 | 1.679104 | 1.729794 | 1.854025 | 2.076426 | 1.789945 | 1.656151 | 1.888714 | 1.678676 |
Romania | 0.624442 | 1.126126 | 0.778910 | 0.712378 | 0.932836 | 0.775425 | 0.868341 | 0.994591 | 0.842327 | 0.619648 | 0.444403 | 0.490934 |
Slovakia | 0.802855 | 0.450450 | 0.359497 | 0.445236 | 0.447761 | 0.656129 | 0.422436 | 0.471122 | 0.355357 | 0.383055 | 0.333302 | 0.356323 |
Slovenia | 0.624442 | 0.525526 | 0.599161 | 0.534283 | 0.634328 | 0.805249 | 0.516311 | 0.820101 | 0.710713 | 0.349256 | 0.444403 | 0.316731 |
Spain | 4.460303 | 3.678679 | 4.134212 | 4.986643 | 5.149254 | 5.517447 | 5.444731 | 4.763567 | 4.685444 | 4.348806 | 4.379224 | 5.067701 |
Sweden | 3.033006 | 3.753754 | 3.535051 | 3.695459 | 4.216418 | 5.070086 | 5.468200 | 4.048159 | 5.067123 | 4.044615 | 3.962596 | 4.038324 |
Switzerland | 3.300624 | 3.753754 | 3.235470 | 3.294746 | 2.761194 | 2.833284 | 3.637644 | 3.402548 | 3.066596 | 2.963046 | 3.231182 | 3.539473 |
United Kingdom | 32.381802 | 31.306306 | 31.815458 | 29.385574 | 29.141791 | 29.197733 | 31.682704 | 32.053743 | 31.982101 | 35.015773 | 34.422739 | 33.613113 |
f, ax = plt.subplots(figsize=(15, 15))
g = sns.heatmap(year_percent_pivot, annot=True, fmt='.1f', linewidths=(.5), ax=ax, cbar=False)
for t in ax.texts: t.set_text(t.get_text() + " %")
g.set(xlabel="", ylabel="")
for i in range(year_percent_pivot.shape[1]+1):
    ax.axvline(i, color='white', lw=10)
# Institutional collab
wos_univ_locations = wos_univ.merge(wos_country_types, on="Country")
wos_univ_collabs = wos_univ_locations[wos_univ_locations["Country_Type"]!="Other"][[record_col,"Country","Institution_harm","Country_Type"]].drop_duplicates()
wos_univ_collabs.sample(100)
 | UT (Unique WOS ID) | Country | Institution_harm | Country_Type |
---|---|---|---|---|
104534 | WOS:000536637200011 | United Kingdom | Univ Warwick | Non-EU associate |
120323 | WOS:000373806800006 | France | ENSAIT | EU |
41841 | WOS:000542956600003 | China | Nanjing Univ Aeronaut & Astronaut | China |
100019 | WOS:000459844300007 | United Kingdom | Univ Manchester | Non-EU associate |
174151 | WOS:000843324300007 | Ireland | Trinity Coll Dublin | EU |
... | ... | ... | ... | ... |
157638 | WOS:000863147500001 | Finland | Univ Turku | EU |
71835 | WOS:000798227800116 | China | Shanghai Jiao Tong Univ | China |
128870 | WOS:000460118200077 | Sweden | Royal Inst Technol | EU |
37822 | WOS:000517665600048 | China | Chinese Acad Sci | China |
26625 | WOS:000453750400001 | China | Hangzhou Dianzi Univ | China |
100 rows × 4 columns
TOPN = 25
wos_univ_ch = wos_univ_collabs[wos_univ_collabs["Country_Type"]=="China"]
wos_univ_eu = wos_univ_collabs[wos_univ_collabs["Country_Type"]!="China"]
wos_univ_eu_strict = wos_univ_collabs[wos_univ_collabs["Country_Type"]=="EU"]
data_eu = (wos_univ_eu.groupby(["Country","Institution_harm","Country_Type"], as_index=False)[record_col].nunique()
.sort_values(by=record_col,ascending=False).head(TOPN).copy())
data_eu_strict = (wos_univ_eu_strict.groupby(["Country","Institution_harm","Country_Type"], as_index=False)[record_col].nunique()
.sort_values(by=record_col,ascending=False).head(TOPN).copy())
data_eu_strict
data_ch = (wos_univ_ch.groupby(["Country","Institution_harm","Country_Type"], as_index=False)[record_col].nunique()
.sort_values(by=record_col,ascending=False).head(TOPN).copy())
for data in [data_eu, data_eu_strict, data_ch]:
    fig = px.bar(data, x=record_col, y="Institution_harm", color="Country_Type",
                 labels={
                     record_col: 'Number of co-publications',
                     "Institution_harm": "Institution",
                     "Country_Type": "Country type"
                 },
                 title="Most visible institutions", template='plotly')
    fig.update_layout(xaxis_tickformat='d', font_family="Montserrat", yaxis={'categoryorder':'total ascending'},
                      width=1000, height=1000)
    fig.update_traces(hovertemplate='%{x:d}')
    fig.add_shape(
        # Rectangle with reference to the plot
        type="rect",
        xref="paper",
        yref="paper",
        x0=0,
        y0=0,
        x1=1.0,
        y1=1.0,
        line=dict(
            color="black",
            width=0.5,
        )
    )
    fig.update_yaxes(
        showgrid=True,
        ticks="outside")
    fig.update_xaxes(
        showgrid=True,
        ticks="outside")
    fig.show(config=dict(displayModeBar=False))
wos_univ_test = wos_univ_locations[wos_univ_locations["Country_Type"]!="Other"][[record_col,"Country","Institution","Institution_harm","Country_Type"]].drop_duplicates()
www = wos_univ_test.groupby(["Institution","Institution_harm"], as_index=False)[record_col].nunique()
www[www["Institution_harm"]=="Chinese Acad Sci"]
 | Institution | Institution_harm | UT (Unique WOS ID) |
---|---|---|---|
16 | Chinese Acad Sci | Chinese Acad Sci | 1 |
3149 | Chinese Acad Sci | Chinese Acad Sci | 4614 |
3153 | Chinese Acad Sci AIRCAS | Chinese Acad Sci | 2 |
3155 | Chinese Acad Sci CAREERI CAS | Chinese Acad Sci | 1 |
3157 | Chinese Acad Sci CASIA | Chinese Acad Sci | 8 |
3159 | Chinese Acad Sci GUCAS | Chinese Acad Sci | 2 |
3160 | Chinese Acad Sci IAP | Chinese Acad Sci | 1 |
3161 | Chinese Acad Sci IECAS | Chinese Acad Sci | 2 |
3162 | Chinese Acad Sci IME CAS | Chinese Acad Sci | 1 |
3163 | Chinese Acad Sci IMECAS | Chinese Acad Sci | 1 |
3164 | Chinese Acad Sci ITP CAS | Chinese Acad Sci | 1 |
3166 | Chinese Acad Sci NAOC | Chinese Acad Sci | 1 |
3167 | Chinese Acad Sci NAOC CAS | Chinese Acad Sci | 2 |
13501 | RCEES Chinese Acad Sci | Chinese Acad Sci | 1 |
19499 | ZIAT Chinese Acad Sci | Chinese Acad Sci | 1 |
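# Counting unique records per harmonized name directly avoids double counting
# papers that appear under several raw spellings (illustrative check):
wos_univ_test.groupby("Institution_harm")[record_col].nunique().loc["Chinese Acad Sci"]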
wos_univ_ch = wos_univ_collabs[wos_univ_collabs["Country_Type"]=="China"]
wos_univ_eu = wos_univ_collabs[wos_univ_collabs["Country_Type"]!="China"]
wos_univ_dipol = wos_univ_eu.merge(wos_univ_ch, on=record_col, suffixes=('_eu', '_ch')).merge(wos[[record_col,"Domain_English","Field_English","SubField_English"]], on =record_col)
wos_univ_dipol.sample(100)
 | UT (Unique WOS ID) | Country_eu | Institution_harm_eu | Country_Type_eu | Country_ch | Institution_harm_ch | Country_Type_ch | Domain_English | Field_English | SubField_English |
---|---|---|---|---|---|---|---|---|---|---|
263074 | WOS:000597493300001 | United Kingdom | Univ Northumbria Newcastle | Non-EU associate | China | Nanchang Univ | China | Natural Sciences | Chemistry | Analytical Chemistry |
71907 | WOS:000494411700001 | Germany | Univ Wurzburg | EU | China | South China Agr Univ | China | Economic & Social Sciences | Social Sciences | Information & Library Sciences |
303069 | WOS:000569985300066 | Italy | Selex | EU | China | Wuhan Elect Informat Inst | China | Applied Sciences | Engineering | Computation Theory & Mathematics |
259937 | WOS:000557391000036 | United Kingdom | Univ Glasgow | Non-EU associate | China | Southwest Jiaotong Univ | China | Natural Sciences | Chemistry | Analytical Chemistry |
302133 | WOS:000477943300012 | Italy | Politecn Milan | EU | China | City Univ Hong Kong | China | Applied Sciences | Information & Communication Technologies | Artificial Intelligence & Image Processing |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
179087 | WOS:000460128100005 | Finland | Univ Jyvaskyla | EU | China | Capital Med Univ | China | Health Sciences | Clinical Medicine | Ophthalmology & Optometry |
333080 | WOS:000589420400001 | Ireland | Univ Coll Dublin | EU | China | Guangxi Normal Univ | China | Natural Sciences | Chemistry | Organic Chemistry |
300958 | WOS:000388876400003 | Finland | Nokias Mobile Networks Organizat | EU | China | Nokia Bell Labs | China | Applied Sciences | Information & Communication Technologies | Networking & Telecommunications |
95342 | WOS:000579154000008 | United Kingdom | Imperial Coll London | Non-EU associate | China | Wuhan Polytech Univ | China | Health Sciences | Clinical Medicine | General & Internal Medicine |
197767 | WOS:000571399800004 | Switzerland | Univ Bern | Non-EU associate | China | Shandong Univ | China | Natural Sciences | Physics & Astronomy | General Physics |
100 rows × 10 columns
fig = px.parallel_categories(wos_univ_dipol[["Country_eu","Domain_English","Country_ch"]])
fig.show()