diff --git a/WOS/wos_analysis/u_institution_frequency_exploration.py b/WOS/wos_analysis/u_institution_frequency_exploration.py
new file mode 100644
index 0000000..5613e5d
--- /dev/null
+++ b/WOS/wos_analysis/u_institution_frequency_exploration.py
@@ -0,0 +1,21 @@
+#%% Compare the frequency of the inst. names with the wos-analysis on the web interface
+# Counts, for each harmonized institution name, the number of distinct
+# publications (WOS "UT" ids) that list it, and writes the result as a CSV.
+import pandas as pd
+
+# %% Load the harmonized institution table (path is relative to the working directory)
+wos_inst_df = pd.read_excel("../wos_processed_data/wos_institution_locations_harmonized.xlsx")
+
+# %% Get rid of the duplicate institution name entries in each individual publication
+# De-duplicate on the two columns directly instead of joining them with ";" and
+# splitting again: a ";" inside an institution name would truncate the name, and
+# a missing value would turn the joined key into NaN and break ``str.split``.
+# Rows lacking either value cannot be attributed to a publication, so they are
+# dropped before counting.
+pub_inst_pairs = wos_inst_df[["Institution", "UT (Unique WOS ID)"]].dropna().drop_duplicates()
+wos_pubunique_inst = pub_inst_pairs["Institution"].tolist()
+
+# %% Calc. frequencies, and get an output
+# One row per institution, sorted by descending publication count.
+wos_inst_freqdf = pd.DataFrame(wos_pubunique_inst).value_counts().rename_axis('institution').reset_index(name='frequency')
+wos_inst_freqdf.to_csv("../wos_processed_data/wos_pubunique-institution_frequency.csv", index=False)
diff --git a/WOS/wos_processed_data/wos_pubunique-institution_frequency.csv b/WOS/wos_processed_data/wos_pubunique-institution_frequency.csv
new file mode 100644
index 0000000..a175556
--- /dev/null
+++ b/WOS/wos_processed_data/wos_pubunique-institution_frequency.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d4a58f241d2e9b130869eedae2119be43919952d6dd236cdcd4d613a48d3e6b
+size 453006
diff --git a/WOS/wos_processed_data/~$wos_institution_locations_harmonized.xlsx b/WOS/wos_processed_data/~$wos_institution_locations_harmonized.xlsx
new file mode 100644
index 0000000..a20b14c
Binary files /dev/null and b/WOS/wos_processed_data/~$wos_institution_locations_harmonized.xlsx differ