You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_cpc_parse.ipynb

421 lines
46 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 60,
"id": "a8be6839",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import janitor\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "211ba466",
"metadata": {},
"outputs": [],
"source": [
"outdir=\"WESTERN_CH_scope\"\n",
"\n",
"appln = pd.read_csv(f\"{outdir}/tls_201_scope.csv\")\n",
"\n",
"appln_title = pd.read_csv(f\"{outdir}/tls_202_scope.csv\")\n",
"\n",
"pers = pd.read_csv(f\"{outdir}/tls_206_scope.csv\")\n",
"\n",
"appln_pers = pd.read_csv(f\"{outdir}/tls_207_scope.csv\")\n",
"\n",
"appln_cpc = pd.read_csv(f\"{outdir}/tls_224_scope.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "f878b151",
"metadata": {},
"outputs": [],
"source": [
"# workdir_path=r\"CPCTitleList202302\"\n",
"# # outfile='wos_extract_complete.csv'\n",
"# # with_header=True\n",
"# cpc_ids = pd.DataFrame()\n",
"# for root, dirs, files in os.walk(workdir_path):\n",
"# for filename in files:\n",
"# path=os.path.join(root, filename)\n",
"# section = pd.read_csv(path, sep='\\t', header=None)\n",
"# cpc_ids=pd.concat([cpc_ids,section], ignore_index=True)\n",
"# cpc_ids.columns =[\"cpc_id\",\"idk\",\"cpc_name\"]\n",
"# cpc_ids = cpc_ids.drop(columns=\"idk\")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "95ea20da",
"metadata": {},
"outputs": [],
"source": [
"parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}\n",
"for letter in 'ABCDEFGHY':\n",
" file = f'CPC_data/CPCTitleList202302/cpc-section-{letter}_20230201.txt'\n",
" with open(file) as f:\n",
" for line in f:\n",
" vals = line.strip().split('\\t')\n",
" if len(vals) == 2:\n",
" parsed['code'].append(vals[0])\n",
" parsed['title'].append(vals[1])\n",
" elif len(vals) == 3:\n",
" parsed['code'].append(vals[0])\n",
" parsed['title'].append(vals[2])\n",
"\n",
"\n",
"\n",
"for i in range(len(parsed['code'])):\n",
" code = parsed['code'][i]\n",
" main_group = code.split('/')[-1] if \"/\" in code else None\n",
" group = code.split('/')[0][4:] if len(code) >= 5 else None\n",
" subclass = code[3] if len(code) >= 4 else None\n",
" class_ = code[1:3] if len(code) >= 3 else None\n",
" section = code[0] if len(code) >= 1 else None\n",
" \n",
" parsed['main_group'].append(main_group)\n",
" parsed['group'].append(group)\n",
" parsed['subclass'].append(subclass)\n",
" parsed['class'].append(class_)\n",
" parsed['section'].append(section)\n",
"\n",
"cpc_ids2023 = pd.DataFrame.from_dict(parsed)\n",
"cpc_ids2023['cpc_version']=2023\n",
"cpc_ids2022 = pd.read_csv(\"CPC_data/cpc_titles_2022.csv\")\n",
"cpc_ids2022['cpc_version']=2022\n",
"cpc_ids = pd.concat([cpc_ids2023,cpc_ids2022], ignore_index=True)\n",
"cpc_ids = cpc_ids.rename(columns={\"code\":\"cpc_id\",\"title\":\"cpc_name\"}).drop_duplicates(subset=\"cpc_id\")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "907d9c3e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 64,
"id": "1be8971a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70 cpc_ids not found\n",
"0.07344840249724569 % lost\n"
]
}
],
"source": [
"appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n",
"appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n",
"\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique()), \"cpc_ids not found\")\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique())/len(appln_cpc_tax[\"cpc_id\"].unique())*100, \"% lost\")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "b1274c34",
"metadata": {},
"outputs": [],
"source": [
"cpc_dict = dict(zip(cpc_ids.cpc_id.str.replace(\" \",\"\"), cpc_ids.cpc_name))\n",
"# cpc_dict"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "2a7e39ee",
"metadata": {},
"outputs": [],
"source": [
"def cpc_classifier(id_text):\n",
" taxonomy = []\n",
" iter_text = id_text.replace(\" \",\"\")\n",
" for i in range(len(iter_text)+1):\n",
" tax_id = iter_text[:i]\n",
" tax_name = cpc_dict.get(iter_text[:i])\n",
" if tax_name:\n",
" taxonomy.append((tax_id,tax_name))\n",
" return taxonomy\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "e31a013f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "[('A', 'HUMAN NECESSITIES'),\n ('A01',\n 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING'),\n ('A01B',\n 'SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL (making or covering furrows or holes for sowing, planting, or manuring A01C5/00; soil working for engineering purposes E01, E02, E21; {measuring areas for agricultural purposes G01B})'),\n ('A01B1/06',\n 'Hoes; Hand cultivators {(rakes A01D7/00; forks A01D9/00; picks B25D)}'),\n ('A01B1/065', '{powered}')]"
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpc_classifier(\"A01B1/065\")"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "f09a616c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name section class \n0 A HUMAN NECESSITIES A None \\\n1 A01 AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... A 01 \n2 A01B SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... A 01 \n3 A01B1/00 Hand tools (edge trimmers for lawns A01G3/06 ... A 01 \n4 A01B1/02 Spades; Shovels {(hand-operated dredgers E02F3... A 01 \n\n subclass group main_group cpc_version \n0 None None None 2023 \\\n1 None None None 2023 \n2 B None None 2023 \n3 B 1 00 2023 \n4 B 1 02 2023 \n\n version https://git-lfs.github.com/spec/v1 \n0 NaN \\\n1 NaN \n2 NaN \n3 NaN \n4 NaN \n\n cpc_taxonomy \n0 [(A, HUMAN NECESSITIES)] \n1 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n2 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n3 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n4 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A</td>\n <td>HUMAN NECESSITIES</td>\n <td>A</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES)]</td>\n </tr>\n <tr>\n <th>1</th>\n <td>A01</td>\n <td>AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...</td>\n <td>A</td>\n <td>01</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A01B</td>\n <td>SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A01B1/00</td>\n <td>Hand tools (edge trimmers for lawns A01G3/06 ...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>00</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A01B1/02</td>\n <td>Spades; Shovels {(hand-operated dredgers E02F3...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>02</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpc_ids[\"cpc_taxonomy\"] = cpc_ids[\"cpc_id\"].fillna(\"\").map(cpc_classifier)\n",
"cpc_ids.head()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "f3fa8bf3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70 cpc_ids not found\n",
"0.07344840249724569 % lost\n"
]
}
],
"source": [
"appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n",
"appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique()), \"cpc_ids not found\")\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique())/len(appln_cpc_tax[\"cpc_id\"].unique())*100, \"% lost\")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "58701721",
"metadata": {},
"outputs": [],
"source": [
"# appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique()"
]
},
{
"cell_type": "markdown",
"id": "ca631acf",
"metadata": {},
"source": [
"## 'AI/Big Data' keywords"
]
},
{
"cell_type": "code",
"execution_count": 71,
"outputs": [
{
"data": {
"text/plain": "'neural network|machine learn|deep learn|remote sensing|convolutional neural|internet of things|feature extraction|genetic algorithm|big data|artificial intelligence|data driven|support vector machine|logistic regression not p=|optimization algorithm|principal component analysis|artificial neural network|swarm optimization|regularization|linear regression not p=|optimization algorithm|random forest|cloud computing|reinforcement learning|computer vision|kalman filter|image processing|data mining|evolutionary algorithm|edge computing|supervised learning|computational modeling|pattern recognition|image classification|long short-term memor|robotics|image segmentation|convex optimization|covariance matri|attention mechanism|markov chain|object detection not brain|clustering algorithm|recurrent neural network|data augmentation|transfer learning|adversarial network|decision tree|multi agent system|fuzzy set|convolutional network|image reconstruction|data analytic|smart grid|autoencoder|fuzzy logic|radial basis function|bayesian network|dimensionality reduction|face recognition not brain|gaussian process|anomaly detection|k-nearest neighbor|natural language processing|monte carlo method|large dataset|gradient descent|support vector regression|extreme learning machine|perceptron|model selection|ensemble learning|representation learning|recommender system|target tracking|singular value decomposition|feature learning|smart city|sentiment analy|markov decision process|k-means clustering|independent component analysis|brain computer interface|human-computer interaction|markov chain monte carlo|hierarchical clustering|semantic web|semi-supervised learning|human-robot interact|knowledge graph|speech recognition not brain|ensemble model|fog computing|mapreduce|evolutionary computation|data science|text mining|generative model|active learning|swarm intelligence|multi-task learning|language model|collaborative filtering|backpropagation|machine vision|computer-aided 
diagnosis|gated recurrent unit|lagrange multiplier|expert system|learning rate|hadoop|markov process|nonlinear optimization|learning system|self-organizing map|smart manufacturing|smart home|few shot learning|few-shot learning|meta-learning|meta learning|adversarial training|zero-shot learning|word embedding|expectation maximization algorithm|stochastic gradient descent|ridge regression|deep belief network|non-negative matrix factorization|affective computing|latent dirichlet allocation|kernel method|kernel learning|feature engineering|variational inference|image representation|manifold learning|adversarial example|knowledge distillation|time series forecast|variational autoencoder|lasso regression|smart energy|dbscan|multi-label classification|intelligent robot|ubiquitous computing|gaussian mixture models|smart technolog|boltzmann machine|smart buildings|predictive analytic|pervasive computing|smart agriculture|capsule network|human-in-the-loop|intelligent agent|ai applications|word vector|transformer model|facial recognition|unstructured data|restricted boltzmann machine|albert|lifelong learning|autonomous agents|chatbot|cholesky decomposition|nosql|nosql|explainable ai|seq2seq|probabilistic graphical model|qr decomposition|unsupervised deep learning|data warehouse|quantum machine learning|continual learning|smart environment|multimodal learning|smart health|artificial immune system|swarm robotics|kernel machine|latent factor model|eigendecomposition|adversarial machine|adversarial machine learning|smart mobility|sequence-to-sequence model|eigen decomposition|adversarial robustness|smart parking|adversarial neural|roberta|bidirectional encoder representations from transformer|locally linear embedding|hebbian learning|one-shot learning|multimodal representation|smart tourism|entity extraction|adaptive moment estimation|ontology learning|topic modeling|relational database'"
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keywords_oklist_source= r'..\\WOS\\kw_token_ranked_bibliometrics_okset.xlsx'\n",
"keyword_df = pd.read_excel(keywords_oklist_source)\n",
"keywords = keyword_df[keyword_df[\"u_Priority (done)\"].isin([\"High\",\"Medium\"])][\"kw_token\"].str.replace('\"','').tolist()\n",
"keywords = [kw.replace(\"*\",\"\").replace(\"$\",\"\").lower() for kw in keywords if (\"?\" not in kw and len(kw)>3)]\n",
"keywords = \"|\".join([kw for kw in keywords if kw not in [\"classifier\",\"clustering\",\"loss function\",'classification']]+[\"relational database\"])\n",
"keywords"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 72,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n15252 A61B2018/0069 {using fuzzy logic} \n... ... ... \n250685 Y10S707/99946 Object-oriented database structure network \n250686 Y10S707/99947 Object-oriented database structure reference \n250687 Y10S707/99948 Application of database or data structure, e.g... \n250688 Y10S707/99951 File or database maintenance \n250703 Y10S715/968 interface for database querying and retrieval \n\n section class subclass group main_group cpc_version \n12725 A 61 B 1 000096 2023 \\\n13764 A 61 B 5 7264 2023 \n13897 A 61 B 6 52 2023 \n14016 A 61 B 8 52 2023 \n15252 A 61 B 2018 0069 2023 \n... ... ... ... ... ... ... \n250685 Y 10 S 707 99946 2023 \n250686 Y 10 S 707 99947 2023 \n250687 Y 10 S 707 99948 2023 \n250688 Y 10 S 707 99951 2023 \n250703 Y 10 S 715 968 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13764 NaN \n13897 NaN \n14016 NaN \n15252 NaN \n... ... \n250685 NaN \n250686 NaN \n250687 NaN \n250688 NaN \n250703 NaN \n\n cpc_taxonomy \n12725 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13764 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13897 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n14016 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n15252 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n... ... \n250685 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250686 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250687 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250688 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250703 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n\n[317 rows x 10 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>1</td>\n <td>000096</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>7264</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>6</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>8</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>15252</th>\n <td>A61B2018/0069</td>\n <td>{using fuzzy logic}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>2018</td>\n <td>0069</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n 
<td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>250685</th>\n <td>Y10S707/99946</td>\n <td>Object-oriented database structure network</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99946</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250686</th>\n <td>Y10S707/99947</td>\n <td>Object-oriented database structure reference</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99947</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250687</th>\n <td>Y10S707/99948</td>\n <td>Application of database or data structure, e.g...</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99948</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250688</th>\n <td>Y10S707/99951</td>\n <td>File or database maintenance</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99951</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250703</th>\n <td>Y10S715/968</td>\n <td>interface for database querying and retrieval</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>715</td>\n <td>968</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n </tbody>\n</table>\n<p>317 rows × 10 columns</p>\n</div>"
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#dummy search\n",
"scope_df = cpc_ids[cpc_ids[\"cpc_name\"].str.lower().str.contains(\"machine learn|neural network|deep learn|deep network|artificial intel*| big data|database|recommender system|computer vision|image processing|language model|language processing|fuzzy logic|principal component|image classification|video classification\", regex=True, na=False)]\n",
"scope_df"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13746 A61B5/72 {Signal processing specially adapted for physi... \n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n... ... ... \n246159 Y10S128/924 using artificial intelligence \n246160 Y10S128/925 Neural network \n248454 Y10S323/909 Remote sensing \n250570 Y10S706/00 Data processing: artificial intelligence \n250571 Y10S706/90 Fuzzy logic \n\n section class subclass group main_group cpc_version \n12725 A 61 B 1 000096 2023 \\\n13746 A 61 B 5 72 2023 \n13764 A 61 B 5 7264 2023 \n13897 A 61 B 6 52 2023 \n14016 A 61 B 8 52 2023 \n... ... ... ... ... ... ... \n246159 Y 10 S 128 924 2023 \n246160 Y 10 S 128 925 2023 \n248454 Y 10 S 323 909 2023 \n250570 Y 10 S 706 00 2023 \n250571 Y 10 S 706 90 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13746 NaN \n13764 NaN \n13897 NaN \n14016 NaN \n... ... \n246159 NaN \n246160 NaN \n248454 NaN \n250570 NaN \n250571 NaN \n\n cpc_taxonomy \n12725 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13746 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13764 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13897 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n14016 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n... ... \n246159 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n246160 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n248454 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250570 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250571 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n\n[358 rows x 10 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>1</td>\n <td>000096</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>13746</th>\n <td>A61B5/72</td>\n <td>{Signal processing specially adapted for physi...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>72</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>7264</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>6</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>8</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n 
<td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>246159</th>\n <td>Y10S128/924</td>\n <td>using artificial intelligence</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>128</td>\n <td>924</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>246160</th>\n <td>Y10S128/925</td>\n <td>Neural network</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>128</td>\n <td>925</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>248454</th>\n <td>Y10S323/909</td>\n <td>Remote sensing</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>323</td>\n <td>909</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250570</th>\n <td>Y10S706/00</td>\n <td>Data processing: artificial intelligence</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>706</td>\n <td>00</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250571</th>\n <td>Y10S706/90</td>\n <td>Fuzzy logic</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>706</td>\n <td>90</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n </tbody>\n</table>\n<p>358 rows × 10 columns</p>\n</div>"
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scope_df = cpc_ids[cpc_ids[\"cpc_name\"].str.lower().str.contains(keywords, regex=True, na=False)]\n",
"scope_df"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 74,
"id": "6c3baa5b",
"metadata": {},
"outputs": [],
"source": [
"scope_ids = scope_df[\"cpc_id\"].unique()\n",
"cpc_ids[\"data_scope\"] = cpc_ids[\"cpc_id\"].isin(scope_ids)\n",
"cpc_ids.to_csv(f\"{outdir}/cpc_defs.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"outputs": [
{
"data": {
"text/plain": "'WESTERN_CH_scope'"
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outdir"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 76,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n0 A HUMAN NECESSITIES \\\n1 A01 AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... \n2 A01B SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... \n3 A01B1/00 Hand tools (edge trimmers for lawns A01G3/06 ... \n4 A01B1/02 Spades; Shovels {(hand-operated dredgers E02F3... \n... ... ... \n260486 Y10T483/1873 Indexing matrix \n260487 Y10T483/1882 Rotary disc \n260488 Y10T483/1891 Chain or belt \n260489 Y10T483/19 Miscellaneous \n260490 NaN NaN \n\n section class subclass group main_group cpc_version \n0 A None None None None 2023 \\\n1 A 01 None None None 2023 \n2 A 01 B None None 2023 \n3 A 01 B 1 00 2023 \n4 A 01 B 1 02 2023 \n... ... ... ... ... ... ... \n260486 Y 10 T 483 1873 2023 \n260487 Y 10 T 483 1882 2023 \n260488 Y 10 T 483 1891 2023 \n260489 Y 10 T 483 19 2023 \n260490 NaN NaN NaN NaN NaN 2022 \n\n version https://git-lfs.github.com/spec/v1 \n0 NaN \\\n1 NaN \n2 NaN \n3 NaN \n4 NaN \n... ... \n260486 NaN \n260487 NaN \n260488 NaN \n260489 NaN \n260490 oid sha256:f138d6bdf2939ba576b96b633d81366123b... \n\n cpc_taxonomy data_scope \n0 [(A, HUMAN NECESSITIES)] False \n1 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n2 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n3 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n4 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n... ... ... \n260486 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260487 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260488 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260489 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260490 [] False \n\n[260491 rows x 11 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n <th>data_scope</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A</td>\n <td>HUMAN NECESSITIES</td>\n <td>A</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES)]</td>\n <td>False</td>\n </tr>\n <tr>\n <th>1</th>\n <td>A01</td>\n <td>AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...</td>\n <td>A</td>\n <td>01</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n <td>False</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A01B</td>\n <td>SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n <td>False</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A01B1/00</td>\n <td>Hand tools (edge trimmers for lawns A01G3/06 ...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>00</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n <td>False</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A01B1/02</td>\n <td>Spades; Shovels {(hand-operated dredgers E02F3...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>02</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n <td>False</td>\n </tr>\n <tr>\n 
<th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>260486</th>\n <td>Y10T483/1873</td>\n <td>Indexing matrix</td>\n <td>Y</td>\n <td>10</td>\n <td>T</td>\n <td>483</td>\n <td>1873</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>False</td>\n </tr>\n <tr>\n <th>260487</th>\n <td>Y10T483/1882</td>\n <td>Rotary disc</td>\n <td>Y</td>\n <td>10</td>\n <td>T</td>\n <td>483</td>\n <td>1882</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>False</td>\n </tr>\n <tr>\n <th>260488</th>\n <td>Y10T483/1891</td>\n <td>Chain or belt</td>\n <td>Y</td>\n <td>10</td>\n <td>T</td>\n <td>483</td>\n <td>1891</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>False</td>\n </tr>\n <tr>\n <th>260489</th>\n <td>Y10T483/19</td>\n <td>Miscellaneous</td>\n <td>Y</td>\n <td>10</td>\n <td>T</td>\n <td>483</td>\n <td>19</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>False</td>\n </tr>\n <tr>\n <th>260490</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>2022</td>\n <td>oid sha256:f138d6bdf2939ba576b96b633d81366123b...</td>\n <td>[]</td>\n <td>False</td>\n </tr>\n </tbody>\n</table>\n<p>260491 rows × 11 columns</p>\n</div>"
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpc_ids"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2e8368b4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 340657036 EP 12000117 A 2012-01-09 \\\n1 340982410 EP 12151915 A 2012-01-20 \n2 341078960 EP 12700310 A 2012-01-11 \n3 341078962 EP 12700311 A 2012-01-11 \n4 341127772 EP 12700372 A 2012-01-02 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 2012 12000117 PI \\\n1 2012 12151915 PI \n2 2012 12700310 PI \n3 2012 12700311 PI \n4 2012 12700372 PI \n\n internat_appln_id ... earliest_pat_publn_id granted docdb_family_id \n0 0 ... 407623142 Y 45507394 \\\n1 0 ... 365158710 Y 45531220 \n2 340778427 ... 413564969 Y 45491582 \n3 340778431 ... 413564970 Y 45491583 \n4 340460188 ... 421840120 Y 45495923 \n\n inpadoc_family_id docdb_family_size nb_citing_docdb_fam nb_applicants \n0 340657036 3 6 1 \\\n1 340982410 2 16 2 \n2 340778427 3 2 1 \n3 340778431 3 3 1 \n4 340460188 4 8 1 \n\n nb_inventors appln_title_lg \n0 2 en \\\n1 6 en \n2 1 en \n3 1 en \n4 2 en \n\n appln_title \n0 Rotating membrane filter disc apparatus \n1 Heating-Cooling-Capacity measurement controlli... \n2 TRANSMISSION DEVICE \n3 TRANSMISSION DEVICE \n4 POWER CONTROL IN A WIRELESS COMMUNICATION SYST... \n\n[5 rows x 28 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>340657036</td>\n <td>EP</td>\n <td>12000117</td>\n <td>A</td>\n <td>2012-01-09</td>\n <td>2012</td>\n <td>12000117</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>407623142</td>\n <td>Y</td>\n <td>45507394</td>\n <td>340657036</td>\n <td>3</td>\n <td>6</td>\n <td>1</td>\n <td>2</td>\n <td>en</td>\n <td>Rotating membrane filter disc apparatus</td>\n </tr>\n <tr>\n <th>1</th>\n <td>340982410</td>\n <td>EP</td>\n <td>12151915</td>\n <td>A</td>\n <td>2012-01-20</td>\n <td>2012</td>\n <td>12151915</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>365158710</td>\n <td>Y</td>\n <td>45531220</td>\n <td>340982410</td>\n <td>2</td>\n <td>16</td>\n <td>2</td>\n <td>6</td>\n <td>en</td>\n <td>Heating-Cooling-Capacity measurement controlli...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>341078960</td>\n <td>EP</td>\n <td>12700310</td>\n <td>A</td>\n <td>2012-01-11</td>\n <td>2012</td>\n <td>12700310</td>\n <td>PI</td>\n <td></td>\n <td>340778427</td>\n <td>...</td>\n <td>413564969</td>\n <td>Y</td>\n <td>45491582</td>\n <td>340778427</td>\n <td>3</td>\n 
<td>2</td>\n <td>1</td>\n <td>1</td>\n <td>en</td>\n <td>TRANSMISSION DEVICE</td>\n </tr>\n <tr>\n <th>3</th>\n <td>341078962</td>\n <td>EP</td>\n <td>12700311</td>\n <td>A</td>\n <td>2012-01-11</td>\n <td>2012</td>\n <td>12700311</td>\n <td>PI</td>\n <td></td>\n <td>340778431</td>\n <td>...</td>\n <td>413564970</td>\n <td>Y</td>\n <td>45491583</td>\n <td>340778431</td>\n <td>3</td>\n <td>3</td>\n <td>1</td>\n <td>1</td>\n <td>en</td>\n <td>TRANSMISSION DEVICE</td>\n </tr>\n <tr>\n <th>4</th>\n <td>341127772</td>\n <td>EP</td>\n <td>12700372</td>\n <td>A</td>\n <td>2012-01-02</td>\n <td>2012</td>\n <td>12700372</td>\n <td>PI</td>\n <td></td>\n <td>340460188</td>\n <td>...</td>\n <td>421840120</td>\n <td>Y</td>\n <td>45495923</td>\n <td>340460188</td>\n <td>4</td>\n <td>8</td>\n <td>1</td>\n <td>2</td>\n <td>en</td>\n <td>POWER CONTROL IN A WIRELESS COMMUNICATION SYST...</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 28 columns</p>\n</div>"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combine the application table (tls_201_scope.csv) with the title table\n",
"# (tls_202_scope.csv), matching rows on appln_id.\n",
"# The join is an inner join, so applications that have no matching row in\n",
"# appln_title are not carried over into appln_data.\n",
"appln_data = pd.merge(appln, appln_title, how=\"inner\", on=\"appln_id\")\n",
"# Preview the first five merged rows.\n",
"appln_data.head(5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}