{ "cells": [ { "cell_type": "code", "execution_count": 44, "id": "a8be6839", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import janitor\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from matplotlib.ticker import MaxNLocator\n", "import math\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 45, "id": "211ba466", "metadata": {}, "outputs": [], "source": [ "outdir=\"WESTERN_CH_scope\"\n", "\n", "appln = pd.read_csv(f\"{outdir}/tls_201_scope.csv\")\n", "\n", "appln_title = pd.read_csv(f\"{outdir}/tls_202_scope.csv\")\n", "\n", "pers = pd.read_csv(f\"{outdir}/tls_206_scope.csv\")\n", "\n", "appln_pers = pd.read_csv(f\"{outdir}/tls_207_scope.csv\")\n", "\n", "appln_cpc = pd.read_csv(f\"{outdir}/tls_224_scope.csv\")" ] }, { "cell_type": "code", "execution_count": 46, "id": "f878b151", "metadata": {}, "outputs": [], "source": [ "# workdir_path=r\"CPCTitleList202302\"\n", "# # outfile='wos_extract_complete.csv'\n", "# # with_header=True\n", "# cpc_ids = pd.DataFrame()\n", "# for root, dirs, files in os.walk(workdir_path):\n", "# for filename in files:\n", "# path=os.path.join(root, filename)\n", "# section = pd.read_csv(path, sep='\\t', header=None)\n", "# cpc_ids=pd.concat([cpc_ids,section], ignore_index=True)\n", "# cpc_ids.columns =[\"cpc_id\",\"idk\",\"cpc_name\"]\n", "# cpc_ids = cpc_ids.drop(columns=\"idk\")" ] }, { "cell_type": "code", "execution_count": 47, "id": "95ea20da", "metadata": {}, "outputs": [], "source": [ "parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}\n", "for letter in 'ABCDEFGHY':\n", " file = f'CPC_data/CPCTitleList202302/cpc-section-{letter}_20230201.txt'\n", " with open(file) as f:\n", " for line in f:\n", " vals = line.strip().split('\\t')\n", " if len(vals) == 2:\n", " parsed['code'].append(vals[0])\n", " parsed['title'].append(vals[1])\n", " elif len(vals) == 3:\n", " parsed['code'].append(vals[0])\n", " parsed['title'].append(vals[2])\n", "\n", "\n", "\n", "for i in range(len(parsed['code'])):\n", " code = parsed['code'][i]\n", " main_group = code.split('/')[-1] if \"/\" in code else None\n", " group = code.split('/')[0][4:] if len(code) >= 5 else None\n", " subclass = code[3] if len(code) >= 4 else None\n", " class_ = code[1:3] if len(code) >= 3 else None\n", " section = code[0] if len(code) >= 1 else None\n", " \n", " parsed['main_group'].append(main_group)\n", " parsed['group'].append(group)\n", " parsed['subclass'].append(subclass)\n", " parsed['class'].append(class_)\n", " parsed['section'].append(section)\n", "\n", "cpc_ids2023 = pd.DataFrame.from_dict(parsed)\n", "cpc_ids2023['cpc_version']=2023\n", "cpc_ids2022 = pd.read_csv(\"CPC_data/cpc_titles_2022.csv\")\n", "cpc_ids2022['cpc_version']=2022\n", "cpc_ids = pd.concat([cpc_ids2023,cpc_ids2022], ignore_index=True)\n", "cpc_ids = cpc_ids.rename(columns={\"code\":\"cpc_id\",\"title\":\"cpc_name\"}).drop_duplicates(subset=\"cpc_id\")" ] }, { "cell_type": "code", "execution_count": 47, "id": "907d9c3e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 48, "id": "1be8971a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "70 cpc_ids not found\n", "0.07344840249724569 % lost\n" ] } ], "source": [ "appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n", "appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n", "\n", "print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique()), \"cpc_ids not found\")\n", "print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique())/len(appln_cpc_tax[\"cpc_id\"].unique())*100, \"% lost\")" ] }, { "cell_type": "code", "execution_count": 49, "id": "b1274c34", "metadata": {}, "outputs": [], "source": [ "cpc_dict = dict(zip(cpc_ids.cpc_id.str.replace(\" \",\"\"), cpc_ids.cpc_name))\n", "# cpc_dict" ] }, { "cell_type": "code", "execution_count": 50, "id": "2a7e39ee", "metadata": {}, "outputs": [], "source": [ "def cpc_classifier(id_text):\n", " taxonomy = []\n", " iter_text = id_text.replace(\" \",\"\")\n", " for i in range(len(iter_text)+1):\n", " tax_id = iter_text[:i]\n", " tax_name = cpc_dict.get(iter_text[:i])\n", " if tax_name:\n", " taxonomy.append((tax_id,tax_name))\n", " return taxonomy\n", " " ] }, { "cell_type": "code", "execution_count": 51, "id": "e31a013f", "metadata": {}, "outputs": [ { "data": { "text/plain": "[('A', 'HUMAN NECESSITIES'),\n ('A01',\n 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING'),\n ('A01B',\n 'SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL (making or covering furrows or holes for sowing, planting, or manuring A01C5/00; soil working for engineering purposes E01, E02, E21; {measuring areas for agricultural purposes G01B})'),\n ('A01B1/06',\n 'Hoes; Hand cultivators {(rakes A01D7/00; forks A01D9/00; picks B25D)}'),\n ('A01B1/065', '{powered}')]" }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cpc_classifier(\"A01B1/065\")" ] }, { "cell_type": "code", "execution_count": 52, "id": "f09a616c", "metadata": {}, "outputs": [ { "data": { "text/plain": " cpc_id cpc_name section class \n0 A HUMAN NECESSITIES A None \\\n1 A01 AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... A 01 \n2 A01B SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... A 01 \n3 A01B1/00 Hand tools (edge trimmers for lawns A01G3/06 ... A 01 \n4 A01B1/02 Spades; Shovels {(hand-operated dredgers E02F3... A 01 \n\n subclass group main_group cpc_version \n0 None None None 2023 \\\n1 None None None 2023 \n2 B None None 2023 \n3 B 1 00 2023 \n4 B 1 02 2023 \n\n version https://git-lfs.github.com/spec/v1 \n0 NaN \\\n1 NaN \n2 NaN \n3 NaN \n4 NaN \n\n cpc_taxonomy \n0 [(A, HUMAN NECESSITIES)] \n1 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n2 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n3 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n4 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... ", "text/html": "
\n | cpc_id | \ncpc_name | \nsection | \nclass | \nsubclass | \ngroup | \nmain_group | \ncpc_version | \nversion https://git-lfs.github.com/spec/v1 | \ncpc_taxonomy | \n
---|---|---|---|---|---|---|---|---|---|---|
0 | \nA | \nHUMAN NECESSITIES | \nA | \nNone | \nNone | \nNone | \nNone | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES)] | \n
1 | \nA01 | \nAGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... | \nA | \n01 | \nNone | \nNone | \nNone | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
2 | \nA01B | \nSOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... | \nA | \n01 | \nB | \nNone | \nNone | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
3 | \nA01B1/00 | \nHand tools (edge trimmers for lawns A01G3/06 ... | \nA | \n01 | \nB | \n1 | \n00 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
4 | \nA01B1/02 | \nSpades; Shovels {(hand-operated dredgers E02F3... | \nA | \n01 | \nB | \n1 | \n02 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
\n | cpc_id | \ncpc_name | \nsection | \nclass | \nsubclass | \ngroup | \nmain_group | \ncpc_version | \nversion https://git-lfs.github.com/spec/v1 | \ncpc_taxonomy | \ncpc_fullname | \ntax_level_0 | \ntax_level_1 | \ntax_level_2 | \ntax_level_3 | \ntax_level_4 | \ntax_level_5 | \ntax_level_6 | \ntax_level_7 | \n
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12725 | \nA61B1/000096 | \n{using artificial intelligence} | \nA | \n61 | \nB | \n1 | \n000096 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \nInstruments for performing medical examination... | \n{of image signals during a use of endoscope} | \n{using artificial intelligence} | \nNone | \nNone | \n
13764 | \nA61B5/7264 | \n{Classification of physiological signals or da... | \nA | \n61 | \nB | \n5 | \n7264 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{Signal processing specially adapted for physi... | \n{using Wavelet transforms} | \n{Classification of physiological signals or da... | \nNone | \nNone | \n
13897 | \nA61B6/52 | \n{Devices using data or image processing specia... | \nA | \n61 | \nB | \n6 | \n52 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{Devices using data or image processing specia... | \nNone | \nNone | \nNone | \nNone | \n
14016 | \nA61B8/52 | \n{Devices using data or image processing specia... | \nA | \n61 | \nB | \n8 | \n52 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{Devices using data or image processing specia... | \nNone | \nNone | \nNone | \nNone | \n
15252 | \nA61B2018/0069 | \n{using fuzzy logic} | \nA | \n61 | \nB | \n2018 | \n0069 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{using fuzzy logic} | \nNone | \nNone | \nNone | \nNone | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
250685 | \nY10S707/99946 | \nObject-oriented database structure network | \nY | \n10 | \nS | \n707 | \n99946 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nDistributed or remote access | \nObject-oriented database structure network | \nNone | \nNone | \nNone | \n
250686 | \nY10S707/99947 | \nObject-oriented database structure reference | \nY | \n10 | \nS | \n707 | \n99947 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nDistributed or remote access | \nObject-oriented database structure reference | \nNone | \nNone | \nNone | \n
250687 | \nY10S707/99948 | \nApplication of database or data structure, e.g... | \nY | \n10 | \nS | \n707 | \n99948 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nDistributed or remote access | \nApplication of database or data structure, e.g... | \nNone | \nNone | \nNone | \n
250688 | \nY10S707/99951 | \nFile or database maintenance | \nY | \n10 | \nS | \n707 | \n99951 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nFile or database maintenance | \nNone | \nNone | \nNone | \nNone | \n
250703 | \nY10S715/968 | \ninterface for database querying and retrieval | \nY | \n10 | \nS | \n715 | \n968 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \ninterface for database querying and retrieval | \nNone | \nNone | \nNone | \nNone | \n
317 rows × 19 columns
\n\n | cpc_id | \ncpc_name | \nsection | \nclass | \nsubclass | \ngroup | \nmain_group | \ncpc_version | \nversion https://git-lfs.github.com/spec/v1 | \ncpc_taxonomy | \ncpc_fullname | \ntax_level_0 | \ntax_level_1 | \ntax_level_2 | \ntax_level_3 | \ntax_level_4 | \ntax_level_5 | \ntax_level_6 | \ntax_level_7 | \n
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12725 | \nA61B1/000096 | \n{using artificial intelligence} | \nA | \n61 | \nB | \n1 | \n000096 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \nInstruments for performing medical examination... | \n{of image signals during a use of endoscope} | \n{using artificial intelligence} | \nNone | \nNone | \n
13746 | \nA61B5/72 | \n{Signal processing specially adapted for physi... | \nA | \n61 | \nB | \n5 | \n72 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{Signal processing specially adapted for physi... | \nNone | \nNone | \nNone | \nNone | \n
13764 | \nA61B5/7264 | \n{Classification of physiological signals or da... | \nA | \n61 | \nB | \n5 | \n7264 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{Signal processing specially adapted for physi... | \n{using Wavelet transforms} | \n{Classification of physiological signals or da... | \nNone | \nNone | \n
13897 | \nA61B6/52 | \n{Devices using data or image processing specia... | \nA | \n61 | \nB | \n6 | \n52 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{Devices using data or image processing specia... | \nNone | \nNone | \nNone | \nNone | \n
14016 | \nA61B8/52 | \n{Devices using data or image processing specia... | \nA | \n61 | \nB | \n8 | \n52 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \nHUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... | \nHUMAN NECESSITIES | \nMEDICAL OR VETERINARY SCIENCE; HYGIENE | \nDIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... | \n{Devices using data or image processing specia... | \nNone | \nNone | \nNone | \nNone | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
246159 | \nY10S128/924 | \nusing artificial intelligence | \nY | \n10 | \nS | \n128 | \n924 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nComputer assisted medical diagnostics | \nusing artificial intelligence | \nNone | \nNone | \nNone | \n
246160 | \nY10S128/925 | \nNeural network | \nY | \n10 | \nS | \n128 | \n925 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nComputer assisted medical diagnostics | \nNeural network | \nNone | \nNone | \nNone | \n
248454 | \nY10S323/909 | \nRemote sensing | \nY | \n10 | \nS | \n323 | \n909 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nRemote sensing | \nNone | \nNone | \nNone | \nNone | \n
250570 | \nY10S706/00 | \nData processing: artificial intelligence | \nY | \n10 | \nS | \n706 | \n00 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nData processing: artificial intelligence | \nNone | \nNone | \nNone | \nNone | \n
250571 | \nY10S706/90 | \nFuzzy logic | \nY | \n10 | \nS | \n706 | \n90 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nGENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC | \nTECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... | \nFuzzy logic | \nNone | \nNone | \nNone | \nNone | \n
358 rows × 19 columns
\n