"keywords = [kw.replace(\"*\",\"\").replace(\"$\",\"\").lower() for kw in keywords if (\"?\" not in kw and len(kw)>3)]\n",
"keywords = \"|\".join([kw for kw in keywords if kw not in [\"classifier\",\"clustering\",\"loss function\",'classification']]+[\"relational database\"])\n",
"keywords"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n15252 A61B2018/0069 {using fuzzy logic} \n... ... ... \n250685 Y10S707/99946 Object-oriented database structure network \n250686 Y10S707/99947 Object-oriented database structure reference \n250687 Y10S707/99948 Application of database or data structure, e.g... \n250688 Y10S707/99951 File or database maintenance \n250703 Y10S715/968 interface for database querying and retrieval \n\n section class subclass group main_group cpc_version \n12725 A 61 B 1 000096 2023 \\\n13764 A 61 B 5 7264 2023 \n13897 A 61 B 6 52 2023 \n14016 A 61 B 8 52 2023 \n15252 A 61 B 2018 0069 2023 \n... ... ... ... ... ... ... \n250685 Y 10 S 707 99946 2023 \n250686 Y 10 S 707 99947 2023 \n250687 Y 10 S 707 99948 2023 \n250688 Y 10 S 707 99951 2023 \n250703 Y 10 S 715 968 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13764 NaN \n13897 NaN \n14016 NaN \n15252 NaN \n... ... \n250685 NaN \n250686 NaN \n250687 NaN \n250688 NaN \n250703 NaN \n\n cpc_taxonomy \n12725 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13764 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13897 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n14016 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n15252 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n... ... \n250685 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250686 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250687 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250688 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250703 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n\n[317 rows x 10 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>1</td>\n <td>000096</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>7264</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>6</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>8</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>15252</th>\n <td>A61B2018/0069</td>\n <td>{using fuzzy logic}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>2018</td>\n <td>0069</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>250685</th>\n <td>Y10S707/99946</td>\n <td>Object-oriented database structure network</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99946</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250686</th>\n <td>Y10S707/99947</td>\n <td>Object-oriented database structure reference</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99947</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250687</th>\n <td>Y10S707/99948</td>\n <td>Application of database or data structure, e.g...</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99948</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250688</th>\n <td>Y10S707/99951</td>\n <td>File or database maintenance</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99951</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n <tr>\n <th>250703</th>\n <td>Y10S715/968</td>\n <td>interface for database querying and retrieval</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>715</td>\n <td>968</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n </tr>\n </tbody>\n</table>\n<p>317 rows × 10 columns</p>\n</div>"
"text/plain": " cpc_id cpc_name \n0 A HUMAN NECESSITIES \\\n1 A01 AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... \n2 A01B SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... \n3 A01B1/00 Hand tools (edge trimmers for lawns A01G3/06 ... \n4 A01B1/02 Spades; Shovels {(hand-operated dredgers E02F3... \n... ... ... \n260486 Y10T483/1873 Indexing matrix \n260487 Y10T483/1882 Rotary disc \n260488 Y10T483/1891 Chain or belt \n260489 Y10T483/19 Miscellaneous \n260490 NaN NaN \n\n section class subclass group main_group cpc_version \n0 A None None None None 2023 \\\n1 A 01 None None None 2023 \n2 A 01 B None None 2023 \n3 A 01 B 1 00 2023 \n4 A 01 B 1 02 2023 \n... ... ... ... ... ... ... \n260486 Y 10 T 483 1873 2023 \n260487 Y 10 T 483 1882 2023 \n260488 Y 10 T 483 1891 2023 \n260489 Y 10 T 483 19 2023 \n260490 NaN NaN NaN NaN NaN 2022 \n\n version https://git-lfs.github.com/spec/v1 \n0 NaN \\\n1 NaN \n2 NaN \n3 NaN \n4 NaN \n... ... \n260486 NaN \n260487 NaN \n260488 NaN \n260489 NaN \n260490 oid sha256:f138d6bdf2939ba576b96b633d81366123b... \n\n cpc_taxonomy data_scope \n0 [(A, HUMAN NECESSITIES)] False \n1 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n2 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n3 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n4 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... False \n... ... ... \n260486 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260487 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260488 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260489 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... False \n260490 [] False \n\n[260491 rows x 11 columns]",