You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
871 lines
26 KiB
Plaintext
871 lines
26 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import os\n",
|
|
"import shutil\n",
|
|
"from flashgeotext.geotext import GeoText\n",
|
|
"import re"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"import hashlib\n",
|
|
"\n",
|
|
"def md5hash(s: str):\n",
|
|
" return hashlib.md5(s.encode('utf-8')).hexdigest()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"record_col=\"UT (Unique WOS ID)\"\n",
|
|
"outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
|
|
"\n",
|
|
"wos = wos[((wos[\"Publication Year\"]<2023)&(wos[\"Publication Year\"]>2010))].copy()\n",
|
|
"print(f'Number of initial (valid interval) records: {len(wos)}')\n",
|
|
"\n",
|
|
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
|
|
"\n",
|
|
"\n",
|
|
"metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n",
|
|
"metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n",
|
|
"metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
|
|
"\n",
|
|
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
|
|
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
|
|
"wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n",
|
|
"wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n",
|
|
"\n",
|
|
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"wos_indexed = wos_merge[~wos_merge[\"Domain_English\"].isna()]\n",
|
|
"wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n",
|
|
"\n",
|
|
"\n",
|
|
"wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
|
|
"wos = wos_indexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
|
|
"\n",
|
|
"wos_postmerge = wos.copy()\n",
|
|
"print(f'Number of METRIX filtered records: {len(wos)}')\n",
|
|
"print(f'Number of unindexed records: {len(wos_unindexed)}')\n",
|
|
"\n",
|
|
"# drop entries not indexed by metrix\n",
|
|
"# drop duplicates (based on doi)\n",
|
|
"wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n",
|
|
"wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
|
|
"print(f'Number of filtered records (dropping duplicates): {len(wos)}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos[\"Domain_English\"].value_counts()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_classifier = wos[[\"WoS Categories\",\"Research Areas\"]+list(metrix.columns)].copy().drop_duplicates()\n",
|
|
"wos_classifier = wos_classifier.groupby([\"WoS Categories\",\"Research Areas\"], as_index=False)[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].agg(\n",
|
|
" lambda x: pd.Series.mode(x)[0])"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_to_reindex = wos_unindexed.drop(columns=list(metrix.columns))\n",
|
|
"wos_found = wos_to_reindex.merge(wos_classifier, on=[\"WoS Categories\",\"Research Areas\"], how=\"inner\")\n",
|
|
"# wos_found = wos_to_reindex.merge(wos_classifier, on=\"Research Areas\", how=\"inner\")\n",
|
|
"# # wos_found = wos_to_reindex.merge(wos_classifier, on=\"WoS Categories\", how=\"inner\")\n",
|
|
"wos_stillost = wos_unindexed[~wos_unindexed[record_col].isin(wos_found[record_col])]\n",
|
|
"\n",
|
|
"print(\"Found:\", wos_found[record_col].nunique(),\"\\nLost forever:\", wos_stillost[record_col].nunique())"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos = pd.concat([wos,wos_found], ignore_index=True)\n",
|
|
"print(f'Number of records (after remerge): {len(wos)}')"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos[\"Domain_English\"].value_counts()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
|
|
"wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
|
|
"wos_cat[\"WoS Categories\"].value_counts()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_subcat = wos_cat.copy()\n",
|
|
"wos_subcat[['WoS Category', 'WoS SubCategory']] = wos_subcat[\"WoS Categories\"].str.split(\",\", expand = True, n=1)\n",
|
|
"for c in ['WoS Category', 'WoS SubCategory',\"WoS Categories\"]:\n",
|
|
" wos_subcat[c] = wos_subcat[c].str.strip()\n",
|
|
"wos_subcat.drop_duplicates(subset=[record_col,'WoS Category'])[\"WoS Category\"].value_counts()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
|
|
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
|
|
"wos_areas[\"Research Areas\"].value_counts()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"kw_df = pd.DataFrame()\n",
|
|
"for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
|
|
" kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
|
|
" kwp.name = 'keyword_all'\n",
|
|
" kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
|
|
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n",
|
|
"kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n",
|
|
"kw_df.head(100)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
|
|
"wos_kwd_concat.head()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos.columns"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geotext = GeoText()\n",
|
|
"\n",
|
|
"def extract_location(input_text, key='countries'):\n",
|
|
" anomalies = {\"Malta\":\"Malta\",\n",
|
|
" \"Mongolia\":\"Mongolia\",\n",
|
|
" \"Quatar\":\"Qatar\",\n",
|
|
" \"Qatar\":\"Qatar\",\n",
|
|
" \"Ethiop\":\"Ethiopia\",\n",
|
|
" \"Nigeria\":\"Nigeria\",\n",
|
|
" \"BELAR\":\"Belarus\",\n",
|
|
" \"Venezuela\":\"Venezuela\",\n",
|
|
" \"Cyprus\":\"Cyprus\",\n",
|
|
" \"Ecuador\":\"Ecuador\",\n",
|
|
" \"U Arab\":\"United Arab Emirates\",\n",
|
|
" \"Syria\":\"Syria\",\n",
|
|
" \"Uganda\":\"Uganda\",\n",
|
|
" \"Yemen\":\"Yemen\",\n",
|
|
" \"Mali\":\"Mali\",\n",
|
|
" \"Senegal\":\"Senegal\",\n",
|
|
" \"Vatican\":\"Vatican\",\n",
|
|
" \"Uruguay\":\"Uruguay\",\n",
|
|
" \"Panama\":\"Panama\",\n",
|
|
" \"Fiji\":\"Fiji\",\n",
|
|
" \"Faroe\":\"Faroe Islands\",\n",
|
|
" \"Macedonia\":\"Macedonia\",\n",
|
|
" 'Mozambique':'Mozambique',\n",
|
|
" \"Kuwait\":\"Kuwait\",\n",
|
|
" \"Libya\":\"Libya\",\n",
|
|
" \"Turkiy\":\"Turkey\",\n",
|
|
" \"Liberia\":\"Liberia\",\n",
|
|
" \"Namibia\":\"Namibia\",\n",
|
|
" \"Ivoire\":\"Ivory Coast\",\n",
|
|
" \"Guatemala\":\"Gutemala\",\n",
|
|
" \"Paraguay\":\"Paraguay\",\n",
|
|
" \"Honduras\":\"Honduras\",\n",
|
|
" \"Nicaragua\":\"Nicaragua\",\n",
|
|
" \"Trinidad\":\"Trinidad & Tobago\",\n",
|
|
" \"Liechtenstein\":\"Liechtenstein\",\n",
|
|
" \"Greenland\":\"Denmark\"}\n",
|
|
"\n",
|
|
" extracted = geotext.extract(input_text=input_text)\n",
|
|
" found = extracted[key].keys()\n",
|
|
" if len(sorted(found))>0:\n",
|
|
" return sorted(found)[0]\n",
|
|
" elif key=='countries':\n",
|
|
" for i in ['Scotland','Wales','England', 'N Ireland']:\n",
|
|
" if i in input_text:\n",
|
|
" return 'United Kingdom'\n",
|
|
" for j in anomalies.keys():\n",
|
|
" if j in input_text:\n",
|
|
" return anomalies.get(j)\n",
|
|
" else:\n",
|
|
" return None\n",
|
|
"\n",
|
|
"with open('../eu_members.txt',\"r\") as f:\n",
|
|
" eu_countries=f.readline().split(\",\")\n",
|
|
" eu_countries=[i.strip() for i in eu_countries]\n",
|
|
"\n",
|
|
"def country_cleanup(country):\n",
|
|
" if \"USA\" in country:\n",
|
|
" return \"USA\"\n",
|
|
" elif \"China\" in country:\n",
|
|
" return \"China\"\n",
|
|
" elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n",
|
|
" return \"United Kingdom\"\n",
|
|
" else:\n",
|
|
" return country\n",
|
|
"\n",
|
|
"\n",
|
|
"def country_type(country):\n",
|
|
" if country in eu_countries:\n",
|
|
" return \"EU\"\n",
|
|
" elif country==\"China\":\n",
|
|
" return \"China\"\n",
|
|
" elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n",
|
|
" return \"Non-EU associate\"\n",
|
|
" else:\n",
|
|
" return \"Other\"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
|
|
"\n",
|
|
"\n",
|
|
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
|
|
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
|
|
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"len(locations)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n",
|
|
"locations = locations.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_2\")\n",
|
|
"locations.head(100)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"# import dask.dataframe as dd\n",
|
|
"#\n",
|
|
"# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame\n",
|
|
"# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"# locations_test = locations.head(1000)\n",
|
|
"# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n",
|
|
"# locations_test"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
|
|
"locations[\"Country\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n",
|
|
"locations[\"Country\"]=locations['Country'].apply(lambda x: country_cleanup(x))\n",
|
|
"locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
|
|
"locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
|
|
"locations=locations[locations[\"Country_Type\"].isin(scope_types)]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
|
|
"univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
|
|
"univ_locations = univ_locations.drop_duplicates()\n",
|
|
"univ_locations.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
|
|
"author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
|
|
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
|
|
"author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
|
|
"author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n",
|
|
"author_locations = author_locations.drop(columns=\"Author_name\")\n",
|
|
"author_locations.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"author_locations[author_locations['author_str_id'].duplicated(False)]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
|
|
"# author_primary_region\n",
|
|
"\n",
|
|
"china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
|
|
"eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
|
|
"assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n",
|
|
"\n",
|
|
"\n",
|
|
"# records that have distinct authors with different country affiliations\n",
|
|
"valid_scope = wos[((wos[record_col].isin(china))\n",
|
|
" &\n",
|
|
" ((wos[record_col].isin(eu))\n",
|
|
" |\n",
|
|
" (wos[record_col].isin(assoc))))][record_col].unique()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"author_primary_region.head()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(f'Number of records: {len(wos)}')\n",
|
|
"print(f'Number of valid cooperation records: {len(valid_scope)}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos = wos[wos[record_col].isin(valid_scope)]\n",
|
|
"locations = locations[locations[record_col].isin(valid_scope)]\n",
|
|
"univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
|
|
"author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
|
|
"author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
|
|
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
|
|
"affiliations = affiliations.drop_duplicates()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"affiliations[\"Affiliations\"].value_counts()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"univ_locations[\"Institution\"].value_counts()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"univ_locations[record_col].nunique()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"affiliations[record_col].nunique()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"univ_locations[\"Institution\"].value_counts().sum()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"affiliations[\"Affiliations\"].value_counts().sum()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
|
|
"wos_cat[\"WoS Categories\"].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
|
|
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
|
|
"wos_areas[\"Research Areas\"].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"[c for c in wos.columns if \"_English\" in c]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
|
|
"for m in metrix_levels:\n",
|
|
" wos[m] = wos[m].replace({\"article-level classification\":\"Multidisciplinary\"})\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"wos"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"metrix_levels"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n",
|
|
"record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n",
|
|
"record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n",
|
|
"country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"# Basic network layout"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"country_collabs = record_countries.merge(record_countries, on=record_col)\n",
|
|
"country_collabs = country_collabs[country_collabs[\"Country_x\"]!=country_collabs[\"Country_y\"]]\n",
|
|
"country_collabs[\"weight\"] = 0.5"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"inst_collabs = record_institution.merge(record_institution, on=record_col)\n",
|
|
"inst_collabs = inst_collabs[inst_collabs[\"Institution_x\"]!=inst_collabs[\"Institution_y\"]]\n",
|
|
"inst_collabs[\"weight\"] = 0.5"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos.columns"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"drop_cols = [ws for ws in wos.columns if ((\"uthor\" in ws or \"ddress\" in ws or \"ORCID\" in\n",
|
|
" ws or \"esearcher\" in ws or \"ditor\" in ws or \"name\" in ws or 'SEQ' in ws) and \"eyword\" not in ws)]\n",
|
|
"drop_cols"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"outdir=\"wos_processed_data\""
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"os.makedirs(outdir, exist_ok=True)\n",
|
|
"\n",
|
|
"wos.drop(columns=drop_cols).to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n",
|
|
"\n",
|
|
"record_countries.to_excel(f\"{outdir}/wos_countries.xlsx\", index=False)\n",
|
|
"\n",
|
|
"record_author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n",
|
|
"\n",
|
|
"record_institution.to_excel(f\"{outdir}/wos_institution_locations.xlsx\", index=False)\n",
|
|
"\n",
|
|
"kw_df.to_excel(f\"{outdir}/wos_keywords.xlsx\", index=False)\n",
|
|
"\n",
|
|
"country_types.to_excel(f\"{outdir}/wos_country_types.xlsx\", index=False)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos.drop(columns=drop_cols).to_csv(f\"{outdir}/wos_processed.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"record_countries.to_csv(f\"{outdir}/wos_countries.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"record_author_locations.to_csv(f\"{outdir}/wos_author_locations.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"record_institution.to_csv(f\"{outdir}/wos_institution_locations.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"kw_df.to_csv(f\"{outdir}/wos_keywords.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"country_types.to_csv(f\"{outdir}/wos_country_types.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"inst_collabs.to_csv(f\"{outdir}/wos_inst_collabs.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"country_collabs.to_csv(f\"{outdir}/wos_country_collabs.csv\", index=False, sep='\\t')"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"wos_areas.to_csv(f\"{outdir}/wos_research_areas.csv\", index=False, sep='\\t')\n",
|
|
"\n",
|
|
"wos_subcat.to_csv(f\"{outdir}/wos_categories.csv\", index=False, sep='\\t')"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.16"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|