You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_analysis_pipeline.i...

339 lines
71 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import janitor\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"outdir=\"EU_CH_scope\"\n",
"\n",
"appln = pd.read_csv(f\"{outdir}/tls_201_scope.csv\")\n",
"\n",
"appln_title = pd.read_csv(f\"{outdir}/tls_202_scope.csv\")\n",
"\n",
"pers = pd.read_csv(f\"{outdir}/tls_206_scope.csv\")\n",
"pers['psn_sector'] = pers['psn_sector'].fillna(\"UNKNOWN\")\n",
"\n",
"appln_pers = pd.read_csv(f\"{outdir}/tls_207_scope.csv\")\n",
"\n",
"appln_cpc = pd.read_csv(f\"{outdir}/tls_224_scope.csv\")\n",
"\n",
"cpc_def = pd. read_csv(\"CPC_data/cpc_defs.csv\", low_memory=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"data": {
"text/plain": "65136"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(appln)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13746 A61B5/72 {Signal processing specially adapted for physi... \n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n... ... ... \n246159 Y10S128/924 using artificial intelligence \n246160 Y10S128/925 Neural network \n248454 Y10S323/909 Remote sensing \n250570 Y10S706/00 Data processing: artificial intelligence \n250571 Y10S706/90 Fuzzy logic \n\n section class subclass group main_group cpc_version \n12725 A 61.0 B 1.0 96.0 2023 \\\n13746 A 61.0 B 5.0 72.0 2023 \n13764 A 61.0 B 5.0 7264.0 2023 \n13897 A 61.0 B 6.0 52.0 2023 \n14016 A 61.0 B 8.0 52.0 2023 \n... ... ... ... ... ... ... \n246159 Y 10.0 S 128.0 924.0 2023 \n246160 Y 10.0 S 128.0 925.0 2023 \n248454 Y 10.0 S 323.0 909.0 2023 \n250570 Y 10.0 S 706.0 0.0 2023 \n250571 Y 10.0 S 706.0 90.0 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13746 NaN \n13764 NaN \n13897 NaN \n14016 NaN \n... ... \n246159 NaN \n246160 NaN \n248454 NaN \n250570 NaN \n250571 NaN \n\n cpc_taxonomy \n12725 [('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ... \\\n13746 [('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ... \n13764 [('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ... \n13897 [('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ... \n14016 [('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ... \n... ... \n246159 [('Y', 'GENERAL TAGGING OF NEW TECHNOLOGICAL D... \n246160 [('Y', 'GENERAL TAGGING OF NEW TECHNOLOGICAL D... \n248454 [('Y', 'GENERAL TAGGING OF NEW TECHNOLOGICAL D... \n250570 [('Y', 'GENERAL TAGGING OF NEW TECHNOLOGICAL D... \n250571 [('Y', 'GENERAL TAGGING OF NEW TECHNOLOGICAL D... \n\n cpc_fullname \n12725 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \\\n13746 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n13764 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... 
\n13897 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n14016 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n... ... \n246159 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n246160 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n248454 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250570 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250571 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n\n tax_level_0 \n12725 HUMAN NECESSITIES \\\n13746 HUMAN NECESSITIES \n13764
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n <th>cpc_fullname</th>\n <th>tax_level_0</th>\n <th>tax_level_1</th>\n <th>tax_level_2</th>\n <th>tax_level_3</th>\n <th>tax_level_4</th>\n <th>tax_level_5</th>\n <th>tax_level_6</th>\n <th>tax_level_7</th>\n <th>data_scope</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61.0</td>\n <td>B</td>\n <td>1.0</td>\n <td>96.0</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>Instruments for performing medical examination...</td>\n <td>{of image signals during a use of endoscope}</td>\n <td>{using artificial intelligence}</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>True</td>\n </tr>\n <tr>\n <th>13746</th>\n <td>A61B5/72</td>\n <td>{Signal processing specially adapted for physi...</td>\n <td>A</td>\n <td>61.0</td>\n <td>B</td>\n <td>5.0</td>\n <td>72.0</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing 
specially adapted for physi...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>True</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61.0</td>\n <td>B</td>\n <td>5.0</td>\n <td>7264.0</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing specially adapted for physi...</td>\n <td>{using Wavelet transforms}</td>\n <td>{Classification of physiological signals or da...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>True</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61.0</td>\n <td>B</td>\n <td>6.0</td>\n <td>52.0</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[('A', 'HUMAN NECESSITIES'), ('A61', 'MEDICAL ...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>True</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpc_def[cpc_def[\"data_scope\"]==True]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n33934 497577215 BR 112014017433 A 2013-01-15 \\\n2981 375177905 TW 100128243 A 2011-08-08 \n34857 500398671 US 201815918757 A 2018-03-12 \n14330 425909090 US 201313948663 A 2013-07-23 \n12337 421949639 US 201313799810 A 2013-03-13 \n... ... ... ... ... ... \n20853 451232832 US 201314892356 A 2013-06-05 \n57178 558607779 WO 2020082459 W 2020-03-31 \n60907 571924211 WO 2022025191 W 2022-04-27 \n36638 505223478 CA 3027451 A 2017-06-14 \n33120 496062856 US 201615739023 A 2016-06-21 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n33934 2013 112014017433 PI \\\n2981 2011 100128243 PI \n34857 2018 15918757 PI \n14330 2013 13948663 PI \n12337 2013 13799810 PI \n... ... ... ... ... \n20853 2013 14892356 PI \n57178 2020 CN2020/082459 PI CN \n60907 2022 EP2022/025191 PI EP \n36638 2017 3027451 PI \n33120 2016 15739023 PI \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n33934 379982555 ... 2017-06-13 2017 \\\n2981 0 ... 2012-04-01 2012 \n34857 329543408 ... 2018-09-27 2018 \n14330 0 ... 2015-01-29 2015 \n12337 0 ... 2014-09-18 2014 \n... ... ... ... ... \n20853 424456242 ... 2016-04-14 2016 \n57178 0 ... 2021-10-07 2021 \n60907 0 ... 2022-11-24 2022 \n36638 479992612 ... 2017-12-21 2017 \n33120 473240582 ... 2018-06-21 2018 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n33934 490635945 N 47605490 379982555 \\\n2981 380050378 N 44677979 336552941 \n34857 500398672 Y 43858368 273565445 \n14330 425909091 Y 51167733 419869995 \n12337 421949640 Y 50241294 416000492 \n... ... ... ... ... \n20853 451232833 Y 52007415 424456242 \n57178 558607780 N 77927568 558607779 \n60907 583339295 N 81756758 571924211 \n36638 505223479 N 58241906 476382802 \n33120 496062857 Y 53496595 442016492 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n33934 9 6 1 4 \n2981 5 6 1 3 \n34857 12 17 1 3 \n14330 3 19 1 4 \n12337 4 19 1 2 \n... ...
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_publn_date</th>\n <th>earliest_publn_year</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>33934</th>\n <td>497577215</td>\n <td>BR</td>\n <td>112014017433</td>\n <td>A</td>\n <td>2013-01-15</td>\n <td>2013</td>\n <td>112014017433</td>\n <td>PI</td>\n <td></td>\n <td>379982555</td>\n <td>...</td>\n <td>2017-06-13</td>\n <td>2017</td>\n <td>490635945</td>\n <td>N</td>\n <td>47605490</td>\n <td>379982555</td>\n <td>9</td>\n <td>6</td>\n <td>1</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2981</th>\n <td>375177905</td>\n <td>TW</td>\n <td>100128243</td>\n <td>A</td>\n <td>2011-08-08</td>\n <td>2011</td>\n <td>100128243</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2012-04-01</td>\n <td>2012</td>\n <td>380050378</td>\n <td>N</td>\n <td>44677979</td>\n <td>336552941</td>\n <td>5</td>\n <td>6</td>\n <td>1</td>\n <td>3</td>\n </tr>\n <tr>\n <th>34857</th>\n <td>500398671</td>\n <td>US</td>\n <td>201815918757</td>\n <td>A</td>\n <td>2018-03-12</td>\n <td>2018</td>\n <td>15918757</td>\n <td>PI</td>\n <td></td>\n <td>329543408</td>\n <td>...</td>\n <td>2018-09-27</td>\n <td>2018</td>\n <td>500398672</td>\n <td>Y</td>\n <td>43858368</td>\n <td>273565445</td>\n 
<td>12</td>\n <td>17</td>\n <td>1</td>\n <td>3</td>\n </tr>\n <tr>\n <th>14330</th>\n <td>425909090</td>\n <td>US</td>\n <td>201313948663</td>\n <td>A</td>\n <td>2013-07-23</td>\n <td>2013</td>\n <td>13948663</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2015-01-29</td>\n <td>2015</td>\n <td>425909091</td>\n <td>Y</td>\n <td>51167733</td>\n <td>419869995</td>\n <td>3</td>\n <td>19</td>\n <td>1</td>\n <td>4</td>\n </tr>\n <tr>\n <th>12337</th>\n <td>421949639</td>\n <td>US</td>\n <td>201313799810</td>\n <td>A</td>\n <td>2013-03-13</td>\n <td>2013</td>\n <td>13799810</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2014-09-18</td>\n <td>2014</td>\n <td>421949640</td>\n <td>Y</td>\n <td>50241294</td>\n <td>416000492</td>\n <td>4</td>\n <td>19</td>\n <td>1</td>\n <td>2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>20853</th>\n <td>451232832</td>\n <td>U
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_title_lg \n46635 531117386 en \\\n44552 526116621 en \n45476 528352131 en \n23619 470837180 en \n33233 496310125 en \n... ... ... \n61597 574732234 en \n54568 551189734 en \n35847 503315942 en \n49995 540002470 en \n14915 437594613 en \n\n appln_title \n46635 FRACTURING OPERATIONS PUMP FLEET BALANCE CONTR... \n44552 SHEETLIKE COMPOSITE, IN PARTICULAR FOR THE PRO... \n45476 NATURALLY SWEET ENHANCER COMPOSITION \n23619 METHOD AND APPARATUS FOR MONITORING RADIO LINK... \n33233 METHOD AND DEVICE FOR CONNECTING TO ACCESS POI... \n... ... \n61597 ADJUSTMENT MECHANISM FOR CIRCUIT BREAKER AND C... \n54568 METHODS FOR PDCCH MONITORING, USER EQUIPMENT, ... \n35847 SINGLE-WHEEL DRIVE COMPONENT FOR A MOTOR VEHIC... \n49995 METHODS PROVIDING V2X APPLICATION SERVER REGIS... \n14915 Carbon dots (c dots), method for their prepara... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>46635</th>\n <td>531117386</td>\n <td>en</td>\n <td>FRACTURING OPERATIONS PUMP FLEET BALANCE CONTR...</td>\n </tr>\n <tr>\n <th>44552</th>\n <td>526116621</td>\n <td>en</td>\n <td>SHEETLIKE COMPOSITE, IN PARTICULAR FOR THE PRO...</td>\n </tr>\n <tr>\n <th>45476</th>\n <td>528352131</td>\n <td>en</td>\n <td>NATURALLY SWEET ENHANCER COMPOSITION</td>\n </tr>\n <tr>\n <th>23619</th>\n <td>470837180</td>\n <td>en</td>\n <td>METHOD AND APPARATUS FOR MONITORING RADIO LINK...</td>\n </tr>\n <tr>\n <th>33233</th>\n <td>496310125</td>\n <td>en</td>\n <td>METHOD AND DEVICE FOR CONNECTING TO ACCESS POI...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>61597</th>\n <td>574732234</td>\n <td>en</td>\n <td>ADJUSTMENT MECHANISM FOR CIRCUIT BREAKER AND C...</td>\n </tr>\n <tr>\n <th>54568</th>\n <td>551189734</td>\n <td>en</td>\n <td>METHODS FOR PDCCH MONITORING, USER EQUIPMENT, ...</td>\n </tr>\n <tr>\n <th>35847</th>\n <td>503315942</td>\n <td>en</td>\n <td>SINGLE-WHEEL DRIVE COMPONENT FOR A MOTOR VEHIC...</td>\n </tr>\n <tr>\n <th>49995</th>\n <td>540002470</td>\n <td>en</td>\n <td>METHODS PROVIDING V2X APPLICATION SERVER REGIS...</td>\n </tr>\n <tr>\n <th>14915</th>\n <td>437594613</td>\n <td>en</td>\n <td>Carbon dots (c dots), method for their prepara...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_title.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 330225325 EP 11150195 A 2011-01-05 \\\n1 330225397 EP 11150231 A 2011-01-05 \n2 330322632 EP 11150485 A 2011-01-10 \n3 330326785 EP 11150605 A 2011-01-11 \n4 330350961 EP 11150683 A 2011-01-12 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 2011 11150195 PI \\\n1 2011 11150231 PI \n2 2011 11150485 PI \n3 2011 11150605 PI \n4 2011 11150683 PI \n\n internat_appln_id ... earliest_pat_publn_id granted docdb_family_id \n0 0 ... 335277427 Y 43754737 \\\n1 0 ... 335277736 Y 43619902 \n2 0 ... 364719889 Y 43991052 \n3 0 ... 335277720 N 43023665 \n4 0 ... 364923578 N 43881056 \n\n inpadoc_family_id docdb_family_size nb_citing_docdb_fam nb_applicants \n0 330225325 4 16 1 \\\n1 330225397 6 56 1 \n2 330322632 2 5 1 \n3 328518903 6 9 1 \n4 330350961 7 13 2 \n\n nb_inventors appln_title_lg \n0 1 en \\\n1 9 en \n2 2 en \n3 3 en \n4 5 en \n\n appln_title \n0 Beverage preparation machine \n1 Screwdriving tool having a driving tool with a... \n2 Method and system for recommending contextual ... \n3 Apparatus and method for continuous casting of... \n4 A method and an apparatus for treating at leas... \n\n[5 rows x 28 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>330225325</td>\n <td>EP</td>\n <td>11150195</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150195</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277427</td>\n <td>Y</td>\n <td>43754737</td>\n <td>330225325</td>\n <td>4</td>\n <td>16</td>\n <td>1</td>\n <td>1</td>\n <td>en</td>\n <td>Beverage preparation machine</td>\n </tr>\n <tr>\n <th>1</th>\n <td>330225397</td>\n <td>EP</td>\n <td>11150231</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150231</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277736</td>\n <td>Y</td>\n <td>43619902</td>\n <td>330225397</td>\n <td>6</td>\n <td>56</td>\n <td>1</td>\n <td>9</td>\n <td>en</td>\n <td>Screwdriving tool having a driving tool with a...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>330322632</td>\n <td>EP</td>\n <td>11150485</td>\n <td>A</td>\n <td>2011-01-10</td>\n <td>2011</td>\n <td>11150485</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364719889</td>\n <td>Y</td>\n <td>43991052</td>\n <td>330322632</td>\n <td>2</td>\n <td>5</td>\n <td>1</td>\n 
<td>2</td>\n <td>en</td>\n <td>Method and system for recommending contextual ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>330326785</td>\n <td>EP</td>\n <td>11150605</td>\n <td>A</td>\n <td>2011-01-11</td>\n <td>2011</td>\n <td>11150605</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277720</td>\n <td>N</td>\n <td>43023665</td>\n <td>328518903</td>\n <td>6</td>\n <td>9</td>\n <td>1</td>\n <td>3</td>\n <td>en</td>\n <td>Apparatus and method for continuous casting of...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>330350961</td>\n <td>EP</td>\n <td>11150683</td>\n <td>A</td>\n <td>2011-01-12</td>\n <td>2011</td>\n <td>11150683</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364923578</td>\n <td>N</td>\n <td>43881056</td>\n <td>330350961</td>\n <td>7</td>\n <td>13</td>\n <td>2</td>\n <td>5</td>\n <td>en</td>\n <td>A method and an apparatus for treating at leas...</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 28 columns</p>\n</div>"
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_data = appln.merge(appln_title, on=\"appln_id\")\n",
"appln_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"data": {
"text/plain": "array(['EP', 'WO', 'LU', 'FI', 'NO', 'FR', 'GB', 'KR', 'ES', 'US', 'CA',\n 'DO', 'EC', 'DE', 'UY', 'IL', 'SV', 'PL', 'TR', 'CO', 'CR', 'TW',\n 'MA', 'PE', 'SG', 'CU', 'BE', 'DK', 'AR', 'AP', 'HR', 'MX', 'BR',\n 'EA', 'RU', 'AU', 'MC', 'HU', 'PT', 'NL', 'HN', 'AT', 'RO', 'SM',\n 'CH', 'SI', 'IS', 'CZ', 'HK', 'MD', 'JP', 'CN', 'RS', 'GT', 'UA',\n 'CL', 'SK', 'LT', 'PH', 'MY', 'IN', 'VN', 'TN', 'CY', 'GE', 'ZA',\n 'SE', 'ME', 'JO', 'NI', 'SA'], dtype=object)"
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_data[\"appln_auth\"].unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": " person_id appln_id applt_seq_nr invt_seq_nr\n0 1 413601768 1 0\n1 21 332015605 1 0\n2 21 333490084 1 0\n3 21 335903805 1 0\n4 76 352908776 1 0\n... ... ... ... ...\n1025446 88836321 577982223 1 0\n1025447 88836333 583342135 0 4\n1025448 88836333 583342207 0 3\n1025449 88836333 585957705 0 5\n1025450 88836337 579601496 0 1\n\n[1025451 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>appln_id</th>\n <th>applt_seq_nr</th>\n <th>invt_seq_nr</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>413601768</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>21</td>\n <td>332015605</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>21</td>\n <td>333490084</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>21</td>\n <td>335903805</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>76</td>\n <td>352908776</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1025446</th>\n <td>88836321</td>\n <td>577982223</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1025447</th>\n <td>88836333</td>\n <td>583342135</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1025448</th>\n <td>88836333</td>\n <td>583342207</td>\n <td>0</td>\n <td>3</td>\n </tr>\n <tr>\n <th>1025449</th>\n <td>88836333</td>\n <td>585957705</td>\n <td>0</td>\n <td>5</td>\n </tr>\n <tr>\n <th>1025450</th>\n <td>88836337</td>\n <td>579601496</td>\n <td>0</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n<p>1025451 rows × 4 columns</p>\n</div>"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"data": {
"text/plain": " person_id person_name person_name_orig_lg \n0 1 Nokia Corporation Nokia Corporation \\\n1 128 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n2 5217785 Nokia Corporation Nokia Corporation \n3 5217811 Nokia Corporation Nokia Corporation \n4 5232170 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n... ... ... ... \n354633 88836234 WONG, Chun Lok WONG, Chun Lok \n354634 88836257 XIAONING YE XIAONING YE \n354635 88836321 ZAI LAB (US) LLC ZAI LAB (US) LLC \n354636 88836333 ZHANG, Haocheng 张皓程 \n354637 88836337 ZHANG, Yangjun ZHANG, Yangjun \n\n person_address person_ctry_code nuts nuts_level \n0 Keilalahdentie 4,02150 Espoo FI FI1B1 3 \\\n1 Karaportti 3,02610 Espoo FI FI1B1 3 \n2 Espoo FI FI 0 \n3 NaN FI FI 0 \n4 Espoo FI FI 0 \n... ... ... ... ... \n354633 NaN US NaN 9 \n354634 Portland, Oregon US US NaN 9 \n354635 NaN US NaN 9 \n354636 NaN US NaN 9 \n354637 NaN US NaN 9 \n\n doc_std_name_id doc_std_name psn_id \n0 1 NOKIA CORP 23782051 \\\n1 112 NOKIA SIEMENS NETWORKS OY 23782129 \n2 1 NOKIA CORP 23782051 \n3 1 NOKIA CORP 23782051 \n4 112 NOKIA SIEMENS NETWORKS OY 23782129 \n... ... ... ... \n354633 30867225 WONG CHUN LOK 188836234 \n354634 8004293 XIAONING YE 188836257 \n354635 39363494 ZAI LAB US LLC 188836321 \n354636 7682590 ZHANG HAOCHENG 188836333 \n354637 2112344 ZHANG YANGJUN 188836337 \n\n psn_name psn_level psn_sector han_id han_name \n0 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \\\n1 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n2 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n3 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n4 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n... ... ... ... ... ... 
\n354633 WONG, Chun Lok 0 UNKNOWN 188836234 WONG, Chun Lok \n354634 XIAONING YE 0 UNKNOWN 188836257 XIAONING YE \n354635 ZAI LAB (US) LLC 0 UNKNOWN 188836321 ZAI LAB (US) LLC \n354636 ZHANG, Haocheng 0 UNKNOWN 188836333 ZHANG, Haocheng \n354637 ZHANG, Yangjun 0 UNKNOWN 188836337 ZHANG, Yangjun \n\n han_harmonized psn_sector_primary \n0 2 COMPANY \n1 2 COMPANY \n2 2 COMPANY \n3 2 COMPANY \n4 2 COMPANY \n... ... ... \n354633 0 UNKNOWN \n354634 0 UNKNOWN \n354635 0 UNKNOWN \n354636
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_name_orig_lg</th>\n <th>person_address</th>\n <th>person_ctry_code</th>\n <th>nuts</th>\n <th>nuts_level</th>\n <th>doc_std_name_id</th>\n <th>doc_std_name</th>\n <th>psn_id</th>\n <th>psn_name</th>\n <th>psn_level</th>\n <th>psn_sector</th>\n <th>han_id</th>\n <th>han_name</th>\n <th>han_harmonized</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Keilalahdentie 4,02150 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>128</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Karaportti 3,02610 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>5217785</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>5217811</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>NaN</td>\n <td>FI</td>\n 
<td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5232170</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>354633</th>\n <td>88836234</td>\n <td>WONG, Chun Lok</td>\n <td>WONG, Chun Lok</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>30867225</td>\n <td>WONG CHUN LOK</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354634<
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": " person_id person_name person_name_orig_lg \n0 1 Nokia Corporation Nokia Corporation \\\n1 128 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n2 5217785 Nokia Corporation Nokia Corporation \n3 5217811 Nokia Corporation Nokia Corporation \n4 5232170 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n... ... ... ... \n354633 88836234 WONG, Chun Lok WONG, Chun Lok \n354634 88836257 XIAONING YE XIAONING YE \n354635 88836321 ZAI LAB (US) LLC ZAI LAB (US) LLC \n354636 88836333 ZHANG, Haocheng 张皓程 \n354637 88836337 ZHANG, Yangjun ZHANG, Yangjun \n\n person_address person_ctry_code nuts nuts_level \n0 Keilalahdentie 4,02150 Espoo FI FI1B1 3 \\\n1 Karaportti 3,02610 Espoo FI FI1B1 3 \n2 Espoo FI FI 0 \n3 NaN FI FI 0 \n4 Espoo FI FI 0 \n... ... ... ... ... \n354633 NaN US NaN 9 \n354634 Portland, Oregon US US NaN 9 \n354635 NaN US NaN 9 \n354636 NaN US NaN 9 \n354637 NaN US NaN 9 \n\n doc_std_name_id doc_std_name psn_id \n0 1 NOKIA CORP 23782051 \\\n1 112 NOKIA SIEMENS NETWORKS OY 23782129 \n2 1 NOKIA CORP 23782051 \n3 1 NOKIA CORP 23782051 \n4 112 NOKIA SIEMENS NETWORKS OY 23782129 \n... ... ... ... \n354633 30867225 WONG CHUN LOK 188836234 \n354634 8004293 XIAONING YE 188836257 \n354635 39363494 ZAI LAB US LLC 188836321 \n354636 7682590 ZHANG HAOCHENG 188836333 \n354637 2112344 ZHANG YANGJUN 188836337 \n\n psn_name psn_level psn_sector han_id han_name \n0 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \\\n1 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n2 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n3 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n4 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n... ... ... ... ... ... 
\n354633 WONG, Chun Lok 0 UNKNOWN 188836234 WONG, Chun Lok \n354634 XIAONING YE 0 UNKNOWN 188836257 XIAONING YE \n354635 ZAI LAB (US) LLC 0 UNKNOWN 188836321 ZAI LAB (US) LLC \n354636 ZHANG, Haocheng 0 UNKNOWN 188836333 ZHANG, Haocheng \n354637 ZHANG, Yangjun 0 UNKNOWN 188836337 ZHANG, Yangjun \n\n han_harmonized psn_sector_primary \n0 2 COMPANY \n1 2 COMPANY \n2 2 COMPANY \n3 2 COMPANY \n4 2 COMPANY \n... ... ... \n354633 0 UNKNOWN \n354634 0 UNKNOWN \n354635 0 UNKNOWN \n354636
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_name_orig_lg</th>\n <th>person_address</th>\n <th>person_ctry_code</th>\n <th>nuts</th>\n <th>nuts_level</th>\n <th>doc_std_name_id</th>\n <th>doc_std_name</th>\n <th>psn_id</th>\n <th>psn_name</th>\n <th>psn_level</th>\n <th>psn_sector</th>\n <th>han_id</th>\n <th>han_name</th>\n <th>han_harmonized</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Keilalahdentie 4,02150 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>128</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Karaportti 3,02610 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>5217785</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>5217811</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>NaN</td>\n <td>FI</td>\n 
<td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5232170</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>354633</th>\n <td>88836234</td>\n <td>WONG, Chun Lok</td>\n <td>WONG, Chun Lok</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>30867225</td>\n <td>WONG CHUN LOK</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354634<
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pers_sector_primary = pers.groupby(\"han_id\", as_index=False)[\"psn_sector\"].agg(\n",
" lambda x: pd.Series.mode(x)[0]).rename(columns={\"psn_sector\":\"psn_sector_primary\"})\n",
"persn = pers.merge(pers_sector_primary, on='han_id')\n",
"persn"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": " han_id psn_sector_primary\n0 32 COMPANY\n1 54 COMPANY\n2 83 COMPANY\n3 200 COMPANY\n4 264 GOV NON-PROFIT UNIVERSITY\n... ... ...\n335519 188836234 UNKNOWN\n335520 188836257 UNKNOWN\n335521 188836321 UNKNOWN\n335522 188836333 UNKNOWN\n335523 188836337 UNKNOWN\n\n[335524 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>han_id</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>32</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>54</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>83</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>200</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>264</td>\n <td>GOV NON-PROFIT UNIVERSITY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>335519</th>\n <td>188836234</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335520</th>\n <td>188836257</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335521</th>\n <td>188836321</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335522</th>\n <td>188836333</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335523</th>\n <td>188836337</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>335524 rows × 2 columns</p>\n</div>"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": " han_id psn_sector_primary\n0 32 COMPANY\n1 54 COMPANY\n2 83 COMPANY\n3 200 COMPANY\n4 264 GOV NON-PROFIT UNIVERSITY\n... ... ...\n335519 188836234 UNKNOWN\n335520 188836257 UNKNOWN\n335521 188836321 UNKNOWN\n335522 188836333 UNKNOWN\n335523 188836337 UNKNOWN\n\n[335524 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>han_id</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>32</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>54</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>83</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>200</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>264</td>\n <td>GOV NON-PROFIT UNIVERSITY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>335519</th>\n <td>188836234</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335520</th>\n <td>188836257</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335521</th>\n <td>188836321</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335522</th>\n <td>188836333</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335523</th>\n <td>188836337</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>335524 rows × 2 columns</p>\n</div>"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pers_sector_primary"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [],
"source": [
"# Join the application records with their titles on appln_id and export the\n",
"# combined table to an Excel workbook in the working directory.\n",
"appln_merge = pd.merge(appln, appln_title, on=\"appln_id\")\n",
"appln_merge.to_excel(\"appln_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [],
"source": [
"# Link every application-person record to that person's attributes and export\n",
"# the combined table to an Excel workbook in the working directory.\n",
"# NOTE(review): this joins the raw `pers` table, so the psn_sector_primary column\n",
"# that exists on `persn` is not part of the export - confirm that is intended.\n",
"person_merge = pd.merge(appln_pers, pers, on=\"person_id\")\n",
"person_merge.to_excel(\"person_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 30,
"outputs": [
{
"data": {
"text/plain": "array(['FI', 'NL', 'FR', 'CH', 'US', 'DE', 'DK', 'AT', 'SE', 'BE', 'CN',\n 'IT', 'LU', 'IE', 'SI', 'HK', 'MO', 'CZ', 'ES', 'NO', 'PL', 'HU',\n 'CY', 'SK', 'PT', 'EE', 'MT', 'GR', 'RO', 'BG', 'LT', 'HR', 'LV'],\n dtype=object)"
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pers[\"person_ctry_code\"].unique()"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}