You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_cpc_parse.ipynb

388 lines
52 KiB
Plaintext

2 years ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 44,
2 years ago
"id": "a8be6839",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import janitor\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 45,
2 years ago
"id": "211ba466",
"metadata": {},
"outputs": [],
"source": [
"outdir=\"WESTERN_CH_scope\"\n",
"\n",
"# Scoped table extracts, read in the order 201, 202, 206, 207, 224;\n",
"# each CSV is bound to the name listed at the same position on the left.\n",
"appln, appln_title, pers, appln_pers, appln_cpc = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        f\"{outdir}/tls_201_scope.csv\",\n",
"        f\"{outdir}/tls_202_scope.csv\",\n",
"        f\"{outdir}/tls_206_scope.csv\",\n",
"        f\"{outdir}/tls_207_scope.csv\",\n",
"        f\"{outdir}/tls_224_scope.csv\",\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 46,
2 years ago
"id": "f878b151",
"metadata": {},
"outputs": [],
"source": [
"# workdir_path=r\"CPCTitleList202302\"\n",
"# # outfile='wos_extract_complete.csv'\n",
"# # with_header=True\n",
"# cpc_ids = pd.DataFrame()\n",
"# for root, dirs, files in os.walk(workdir_path):\n",
"# for filename in files:\n",
"# path=os.path.join(root, filename)\n",
"# section = pd.read_csv(path, sep='\\t', header=None)\n",
"# cpc_ids=pd.concat([cpc_ids,section], ignore_index=True)\n",
"# cpc_ids.columns =[\"cpc_id\",\"idk\",\"cpc_name\"]\n",
"# cpc_ids = cpc_ids.drop(columns=\"idk\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
2 years ago
"id": "95ea20da",
"metadata": {},
"outputs": [],
"source": [
"def _split_cpc_code(code):\n",
"    \"\"\"Slice a CPC symbol such as 'A01B1/065' into its positional parts.\n",
"\n",
"    Returns a dict with the keys 'section', 'class', 'subclass', 'group' and\n",
"    'main_group'. A part the symbol is too short to contain (or, for\n",
"    'main_group', a symbol without a slash) is returned as None.\n",
"    \"\"\"\n",
"    return {\n",
"        'section': code[0] if len(code) >= 1 else None,\n",
"        'class': code[1:3] if len(code) >= 3 else None,\n",
"        'subclass': code[3] if len(code) >= 4 else None,\n",
"        # 'group' is what sits between the 4-character subclass and the slash,\n",
"        # 'main_group' is what follows the slash.\n",
"        'group': code.split('/')[0][4:] if len(code) >= 5 else None,\n",
"        'main_group': code.split('/')[-1] if '/' in code else None,\n",
"    }\n",
"\n",
"\n",
"# One tab-separated title list per CPC section; every usable line adds one row.\n",
"parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}\n",
"for letter in 'ABCDEFGHY':\n",
"    file = f'CPC_data/CPCTitleList202302/cpc-section-{letter}_20230201.txt'\n",
"    # Explicit encoding: without it open() uses the platform default, so the\n",
"    # same file could decode differently on Windows and Linux.\n",
"    # NOTE(review): assumes the title lists are UTF-8 - confirm.\n",
"    with open(file, encoding='utf-8') as f:\n",
"        for line in f:\n",
"            vals = line.strip().split('\\t')\n",
"            if len(vals) == 2:\n",
"                code, title = vals\n",
"            elif len(vals) == 3:\n",
"                # Three-field lines: the middle field is not used.\n",
"                code, _, title = vals\n",
"            else:\n",
"                # Lines with any other number of fields are skipped.\n",
"                continue\n",
"            parsed['code'].append(code)\n",
"            parsed['title'].append(title)\n",
"            for part, value in _split_cpc_code(code).items():\n",
"                parsed[part].append(value)\n",
"\n",
"cpc_ids2023 = pd.DataFrame.from_dict(parsed)\n",
"cpc_ids2023['cpc_version']=2023\n",
"\n",
"cpc_ids2022 = pd.read_csv(\"CPC_data/cpc_titles_2022.csv\")\n",
"# Fail loudly when the file is not the real table. A Git LFS pointer file is\n",
"# still parseable as CSV and would otherwise contribute only a junk column\n",
"# and rows with no cpc_id. The rename below needs 'code' and 'title'.\n",
"if not {\"code\", \"title\"}.issubset(cpc_ids2022.columns):\n",
"    raise ValueError(\n",
"        \"CPC_data/cpc_titles_2022.csv has no 'code'/'title' columns \"\n",
"        f\"(found {list(cpc_ids2022.columns)}); if it is a Git LFS pointer, \"\n",
"        \"fetch the real file with `git lfs pull`.\"\n",
"    )\n",
"cpc_ids2022['cpc_version']=2022\n",
"\n",
"# 2023 rows come first, so drop_duplicates keeps the 2023 entry for any\n",
"# symbol present in both versions.\n",
"cpc_ids = pd.concat([cpc_ids2023,cpc_ids2022], ignore_index=True)\n",
"cpc_ids = cpc_ids.rename(columns={\"code\":\"cpc_id\",\"title\":\"cpc_name\"}).drop_duplicates(subset=\"cpc_id\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
2 years ago
"id": "907d9c3e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 48,
2 years ago
"id": "1be8971a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70 cpc_ids not found\n",
"0.07344840249724569 % lost\n"
2 years ago
]
}
],
"source": [
"# Blank-free symbols are the join key against the CPC title table.\n",
"appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n",
"appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n",
"\n",
"# Rows left without a title by the left join carry symbols missing from cpc_ids.\n",
"has_no_title = appln_cpc_tax[\"cpc_name\"].isna()\n",
"unmatched_ids = appln_cpc_tax.loc[has_no_title, \"cpc_id\"].unique()\n",
"all_ids = appln_cpc_tax[\"cpc_id\"].unique()\n",
"print(len(unmatched_ids), \"cpc_ids not found\")\n",
"print(len(unmatched_ids)/len(all_ids)*100, \"% lost\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
2 years ago
"id": "b1274c34",
"metadata": {},
"outputs": [],
"source": [
"# Title lookup keyed by blank-free CPC symbol; a repeated key keeps its last title.\n",
"symbol_keys = cpc_ids.cpc_id.str.replace(\" \",\"\")\n",
"cpc_dict = {symbol: title for symbol, title in zip(symbol_keys, cpc_ids.cpc_name)}"
]
},
{
"cell_type": "code",
"execution_count": 50,
2 years ago
"id": "2a7e39ee",
"metadata": {},
"outputs": [],
"source": [
"def cpc_classifier(id_text):\n",
"    \"\"\"Return the known CPC entries whose symbol is a prefix of ``id_text``.\n",
"\n",
"    Blanks are removed from ``id_text``; every leading substring, shortest\n",
"    first, is then looked up in the module-level ``cpc_dict``. Each prefix\n",
"    with a truthy title yields one ``(prefix, title)`` tuple, so the list\n",
"    runs from the broadest entry to the most specific one, e.g.\n",
"    'A01B1/065' -> A, A01, A01B, A01B1/06, A01B1/065.\n",
"    \"\"\"\n",
"    symbol = id_text.replace(\" \",\"\")\n",
"    prefixes = (symbol[:end] for end in range(len(symbol)+1))\n",
"    return [(prefix, cpc_dict[prefix]) for prefix in prefixes if cpc_dict.get(prefix)]"
]
},
{
"cell_type": "code",
"execution_count": 51,
2 years ago
"id": "e31a013f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "[('A', 'HUMAN NECESSITIES'),\n ('A01',\n 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING'),\n ('A01B',\n 'SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL (making or covering furrows or holes for sowing, planting, or manuring A01C5/00; soil working for engineering purposes E01, E02, E21; {measuring areas for agricultural purposes G01B})'),\n ('A01B1/06',\n 'Hoes; Hand cultivators {(rakes A01D7/00; forks A01D9/00; picks B25D)}'),\n ('A01B1/065', '{powered}')]"
},
"execution_count": 51,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpc_classifier(\"A01B1/065\")"
]
},
{
"cell_type": "code",
"execution_count": 52,
2 years ago
"id": "f09a616c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name section class \n0 A HUMAN NECESSITIES A None \\\n1 A01 AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... A 01 \n2 A01B SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... A 01 \n3 A01B1/00 Hand tools (edge trimmers for lawns A01G3/06 ... A 01 \n4 A01B1/02 Spades; Shovels {(hand-operated dredgers E02F3... A 01 \n\n subclass group main_group cpc_version \n0 None None None 2023 \\\n1 None None None 2023 \n2 B None None 2023 \n3 B 1 00 2023 \n4 B 1 02 2023 \n\n version https://git-lfs.github.com/spec/v1 \n0 NaN \\\n1 NaN \n2 NaN \n3 NaN \n4 NaN \n\n cpc_taxonomy \n0 [(A, HUMAN NECESSITIES)] \n1 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n2 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n3 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n4 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A</td>\n <td>HUMAN NECESSITIES</td>\n <td>A</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES)]</td>\n </tr>\n <tr>\n <th>1</th>\n <td>A01</td>\n <td>AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...</td>\n <td>A</td>\n <td>01</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A01B</td>\n <td>SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A01B1/00</td>\n <td>Hand tools (edge trimmers for lawns A01G3/06 ...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>00</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A01B1/02</td>\n <td>Spades; Shovels {(hand-operated dredgers E02F3...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>02</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
"execution_count": 52,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Missing symbols become \"\" so cpc_classifier always receives a string.\n",
"cpc_symbols = cpc_ids[\"cpc_id\"].fillna(\"\")\n",
"cpc_ids[\"cpc_taxonomy\"] = cpc_symbols.apply(cpc_classifier)\n",
"cpc_ids.head()"
2 years ago
]
},
{
"cell_type": "code",
"execution_count": 53,
2 years ago
"id": "f3fa8bf3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70 cpc_ids not found\n",
"0.07344840249724569 % lost\n"
2 years ago
]
}
],
"source": [
"# Re-run the join so appln_cpc_tax picks up the columns added to cpc_ids since.\n",
"appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n",
"appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n",
"\n",
"# Rows left without a title by the left join carry symbols missing from cpc_ids.\n",
"has_no_title = appln_cpc_tax[\"cpc_name\"].isna()\n",
"unmatched_ids = appln_cpc_tax.loc[has_no_title, \"cpc_id\"].unique()\n",
"all_ids = appln_cpc_tax[\"cpc_id\"].unique()\n",
"print(len(unmatched_ids), \"cpc_ids not found\")\n",
"print(len(unmatched_ids)/len(all_ids)*100, \"% lost\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
2 years ago
"id": "58701721",
"metadata": {},
"outputs": [],
"source": [
"# cpc_ids\n",
"# Full name = the titles along the taxonomy path, joined with \"<>\".\n",
"cpc_ids[\"cpc_fullname\"] = cpc_ids[\"cpc_taxonomy\"].apply(lambda x: \"<>\".join([y[1] for y in x]))\n",
"\n",
"# Split once and reuse: one column per taxonomy level, named tax_level_0, tax_level_1, ...\n",
"tax_levels = cpc_ids[\"cpc_fullname\"].str.split('<>', expand=True)\n",
"colnames = [\"tax_level_\"+ str(i) for i in tax_levels.columns]\n",
"cpc_ids[colnames] = tax_levels"
2 years ago
]
},
{
"cell_type": "markdown",
"id": "ca631acf",
"metadata": {},
"source": [
"## 'AI/Big Data' keywords"
2 years ago
]
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [
{
"data": {
"text/plain": "'neural network|machine learn|deep learn|remote sensing|convolutional neural|internet of things|feature extraction|genetic algorithm|big data|artificial intelligence|data driven|support vector machine|logistic regression not p=|optimization algorithm|principal component analysis|artificial neural network|swarm optimization|regularization|linear regression not p=|optimization algorithm|random forest|cloud computing|reinforcement learning|computer vision|kalman filter|image processing|data mining|evolutionary algorithm|edge computing|supervised learning|computational modeling|pattern recognition|image classification|long short-term memor|robotics|image segmentation|convex optimization|covariance matri|attention mechanism|markov chain|object detection not brain|clustering algorithm|recurrent neural network|data augmentation|transfer learning|adversarial network|decision tree|multi agent system|fuzzy set|convolutional network|image reconstruction|data analytic|smart grid|autoencoder|fuzzy logic|radial basis function|bayesian network|dimensionality reduction|face recognition not brain|gaussian process|anomaly detection|k-nearest neighbor|natural language processing|monte carlo method|large dataset|gradient descent|support vector regression|extreme learning machine|perceptron|model selection|ensemble learning|representation learning|recommender system|target tracking|singular value decomposition|feature learning|smart city|sentiment analy|markov decision process|k-means clustering|independent component analysis|brain computer interface|human-computer interaction|markov chain monte carlo|hierarchical clustering|semantic web|semi-supervised learning|human-robot interact|knowledge graph|speech recognition not brain|ensemble model|fog computing|mapreduce|evolutionary computation|data science|text mining|generative model|active learning|swarm intelligence|multi-task learning|language model|collaborative filtering|backpropagation|machine vision|computer-aided 
diagnosis|gated recurrent unit|lagrange multiplier|expert system|learning rate|hadoop|markov process|nonlinear optimization|learning system|self-organizing map|smart manufacturing|smart home|few shot learning|few-shot learning|meta-learning|meta learning|adversarial training|zero-shot learning|word embedding|expectation maximization algorithm|stochastic gradient descent|ridge regression|deep belief network|non-negative matrix factorization|affective computing|latent dirichlet allocation|kernel method|kernel learning|feature engineering|variational inference|image representation|manifold learning|adversarial example|knowledge distillation|time series forecast|variational autoencoder|lasso regression|smart energy|dbscan|multi-label classification|intelligent robot|ubiquitous computing|gaussian mixture models|smart technolog|boltzmann machine|smart buildings|predictive analytic|pervasive computing|smart agriculture|capsule network|human-in-the-loop|intelligent agent|ai applications|word vector|transformer model|facial recognition|unstructured data|restricted boltzmann machine|albert|lifelong learning|autonomous agents|chatbot|cholesky decomposition|nosql|nosql|explainable ai|seq2seq|probabilistic graphical model|qr decomposition|unsupervised deep learning|data warehouse|quantum machine learning|continual learning|smart environment|multimodal learning|smart health|artificial immune system|swarm robotics|kernel machine|latent factor model|eigendecomposition|adversarial machine|adversarial machine learning|smart mobility|sequence-to-sequence model|eigen decomposition|adversarial robustness|smart parking|adversarial neural|roberta|bidirectional encoder representations from transformer|locally linear embedding|hebbian learning|one-shot learning|multimodal representation|smart tourism|entity extraction|adaptive moment estimation|ontology learning|topic modeling|relational database'"
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Forward slashes resolve on Windows and POSIX alike and need no escaping in the notebook JSON.\n",
"keywords_oklist_source = '../WOS/kw_token_ranked_bibliometrics_okset.xlsx'\n",
"keyword_df = pd.read_excel(keywords_oklist_source)\n",
"\n",
"# Only tokens rated High or Medium are used; empty cells are dropped up front.\n",
"is_relevant = keyword_df[\"u_Priority (done)\"].isin([\"High\",\"Medium\"])\n",
"raw_tokens = keyword_df.loc[is_relevant, \"kw_token\"].dropna().str.replace('\"','').tolist()\n",
"\n",
"# Tokens left out of the title search.\n",
"excluded_tokens = [\"classifier\",\"clustering\",\"loss function\",'classification']\n",
"\n",
"cleaned_tokens = []\n",
"for kw in raw_tokens:\n",
"    # Tokens containing '?' and tokens of three characters or fewer are discarded.\n",
"    if \"?\" in kw or len(kw) <= 3:\n",
"        continue\n",
"    kw = kw.replace(\"*\",\"\").replace(\"$\",\"\").lower()\n",
"    # Some tokens carry a query-style exclusion (\"x not y\"). Left in place it is\n",
"    # matched as literal text and the keyword never hits, so keep only \"x\".\n",
"    kw = kw.split(\" not \")[0]\n",
"    # An empty alternative in the regex would match every title - skip it.\n",
"    if kw and kw not in excluded_tokens:\n",
"        cleaned_tokens.append(kw)\n",
"\n",
"# dict.fromkeys removes repeated tokens while keeping their order.\n",
"keywords = \"|\".join(dict.fromkeys(cleaned_tokens + [\"relational database\"]))\n",
"keywords"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 56,
2 years ago
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n15252 A61B2018/0069 {using fuzzy logic} \n... ... ... \n250685 Y10S707/99946 Object-oriented database structure network \n250686 Y10S707/99947 Object-oriented database structure reference \n250687 Y10S707/99948 Application of database or data structure, e.g... \n250688 Y10S707/99951 File or database maintenance \n250703 Y10S715/968 interface for database querying and retrieval \n\n section class subclass group main_group cpc_version \n12725 A 61 B 1 000096 2023 \\\n13764 A 61 B 5 7264 2023 \n13897 A 61 B 6 52 2023 \n14016 A 61 B 8 52 2023 \n15252 A 61 B 2018 0069 2023 \n... ... ... ... ... ... ... \n250685 Y 10 S 707 99946 2023 \n250686 Y 10 S 707 99947 2023 \n250687 Y 10 S 707 99948 2023 \n250688 Y 10 S 707 99951 2023 \n250703 Y 10 S 715 968 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13764 NaN \n13897 NaN \n14016 NaN \n15252 NaN \n... ... \n250685 NaN \n250686 NaN \n250687 NaN \n250688 NaN \n250703 NaN \n\n cpc_taxonomy \n12725 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \\\n13764 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13897 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n14016 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n15252 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n... ... \n250685 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250686 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250687 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250688 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250703 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n\n cpc_fullname \n12725 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \\\n13764 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... 
\n13897 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n14016 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n15252 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n... ... \n250685 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250686 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250687 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250688 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250703 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n\n tax_level_0 \n12725 HUMAN NECESSITIES \\\n13764 HUMAN NECESSITIES \n13897 HUMA
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n <th>cpc_fullname</th>\n <th>tax_level_0</th>\n <th>tax_level_1</th>\n <th>tax_level_2</th>\n <th>tax_level_3</th>\n <th>tax_level_4</th>\n <th>tax_level_5</th>\n <th>tax_level_6</th>\n <th>tax_level_7</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>1</td>\n <td>000096</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>Instruments for performing medical examination...</td>\n <td>{of image signals during a use of endoscope}</td>\n <td>{using artificial intelligence}</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>7264</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing specially adapted for physi...</td>\n 
<td>{using Wavelet transforms}</td>\n <td>{Classification of physiological signals or da...</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>6</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>8</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>15252</th>\n <td>A61B2018/0069</td>\n <td>{using fuzzy logic}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>2018</td>\n
2 years ago
},
"execution_count": 56,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#dummy search\n",
"scope_df = cpc_ids[cpc_ids[\"cpc_name\"].str.lower().str.contains(\"machine learn|neural network|deep learn|deep network|artificial intel*| big data|database|recommender system|computer vision|image processing|language model|language processing|fuzzy logic|principal component|image classification|video classification\", regex=True, na=False)]\n",
"scope_df"
],
"metadata": {
"collapsed": false
}
2 years ago
},
2 years ago
{
"cell_type": "code",
"execution_count": 57,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13746 A61B5/72 {Signal processing specially adapted for physi... \n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n... ... ... \n246159 Y10S128/924 using artificial intelligence \n246160 Y10S128/925 Neural network \n248454 Y10S323/909 Remote sensing \n250570 Y10S706/00 Data processing: artificial intelligence \n250571 Y10S706/90 Fuzzy logic \n\n section class subclass group main_group cpc_version \n12725 A 61 B 1 000096 2023 \\\n13746 A 61 B 5 72 2023 \n13764 A 61 B 5 7264 2023 \n13897 A 61 B 6 52 2023 \n14016 A 61 B 8 52 2023 \n... ... ... ... ... ... ... \n246159 Y 10 S 128 924 2023 \n246160 Y 10 S 128 925 2023 \n248454 Y 10 S 323 909 2023 \n250570 Y 10 S 706 00 2023 \n250571 Y 10 S 706 90 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13746 NaN \n13764 NaN \n13897 NaN \n14016 NaN \n... ... \n246159 NaN \n246160 NaN \n248454 NaN \n250570 NaN \n250571 NaN \n\n cpc_taxonomy \n12725 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \\\n13746 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13764 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13897 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n14016 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n... ... \n246159 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n246160 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n248454 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250570 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250571 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n\n cpc_fullname \n12725 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \\\n13746 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n13764 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... 
\n13897 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n14016 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n... ... \n246159 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n246160 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n248454 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250570 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250571 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n\n tax_level_0 \n12725 HUMAN NECESSITIES \\\n13746 HUMAN NECESSITIES \n13764 HUMAN NECESSITIE
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n <th>cpc_fullname</th>\n <th>tax_level_0</th>\n <th>tax_level_1</th>\n <th>tax_level_2</th>\n <th>tax_level_3</th>\n <th>tax_level_4</th>\n <th>tax_level_5</th>\n <th>tax_level_6</th>\n <th>tax_level_7</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>1</td>\n <td>000096</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>Instruments for performing medical examination...</td>\n <td>{of image signals during a use of endoscope}</td>\n <td>{using artificial intelligence}</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13746</th>\n <td>A61B5/72</td>\n <td>{Signal processing specially adapted for physi...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>72</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing specially adapted for physi...</td>\n 
<td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>7264</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing specially adapted for physi...</td>\n <td>{using Wavelet transforms}</td>\n <td>{Classification of physiological signals or da...</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>6</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#wos keyword search\n",
"scope_df = cpc_ids[cpc_ids[\"cpc_name\"].str.lower().str.contains(keywords, regex=True, na=False)]\n",
"scope_df"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 58,
"id": "6c3baa5b",
"metadata": {},
2 years ago
"outputs": [],
"source": [
"# Flag every CPC entry whose symbol was selected by the scope search.\n",
"scope_ids = scope_df[\"cpc_id\"].unique()\n",
"cpc_ids[\"data_scope\"] = cpc_ids[\"cpc_id\"].isin(scope_ids)\n",
"\n",
"# dropna returns a new frame, so the result must be kept for the all-empty\n",
"# columns to actually be left out of the export.\n",
"cpc_ids = cpc_ids.dropna(axis=1, how='all')\n",
"cpc_ids.to_csv(\"CPC_data/cpc_defs.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"outputs": [],
"source": [
"# cpc_ids"
2 years ago
],
"metadata": {
"collapsed": false
}
},
2 years ago
{
"cell_type": "code",
"execution_count": 60,
2 years ago
"id": "2e8368b4",
"metadata": {},
"outputs": [],
2 years ago
"source": [
"# appln_data = appln.merge(appln_title, on=\"appln_id\")\n",
"# appln_data.head()"
2 years ago
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}