You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_analysis_pipeline.i...

293 lines
45 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import janitor\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 33,
"outputs": [],
"source": [
"outdir=\"EU_CH_scope/v2_\"\n",
"\n",
"appln = pd.read_csv(f\"{outdir}/tls_201_scope.csv\")\n",
"\n",
"appln_title = pd.read_csv(f\"{outdir}/tls_202_scope.csv\")\n",
"\n",
"pers = pd.read_csv(f\"{outdir}/tls_206_scope.csv\")\n",
"pers['psn_sector'] = pers['psn_sector'].fillna(\"UNKNOWN\")\n",
"\n",
"appln_pers = pd.read_csv(f\"{outdir}/tls_207_scope.csv\")\n",
"\n",
"appln_cpc = pd.read_csv(f\"{outdir}/tls_224_scope.csv\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 330225325 EP 11150195 A 2011-01-05 \\\n1 330322632 EP 11150485 A 2011-01-10 \n2 330350961 EP 11150683 A 2011-01-12 \n3 330374780 WO 2011050339 W 2011-01-12 \n4 330424360 WO 2011050199 W 2011-01-10 \n... ... ... ... ... ... \n64261 575551871 WO 2020142401 W 2020-12-31 \n64262 575551946 WO 2020142230 W 2020-12-31 \n64263 575553943 WO 2021142692 W 2021-12-29 \n64264 575553975 WO 2021142655 W 2021-12-29 \n64265 575556091 WO 2021064274 W 2021-12-20 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 2011 11150195 PI \\\n1 2011 11150485 PI \n2 2011 11150683 PI \n3 2011 EP2011/050339 PI EP \n4 2011 EP2011/050199 PI EP \n... ... ... ... ... \n64261 2020 CN2020/142401 PI CN \n64262 2020 CN2020/142230 PI CN \n64263 2021 CN2021/142692 PI CN \n64264 2021 CN2021/142655 PI CN \n64265 2021 US2021/064274 PI US \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n0 0 ... 2011-07-13 2011 \\\n1 0 ... 2012-07-11 2012 \n2 0 ... 2012-07-18 2012 \n3 0 ... 2011-07-21 2011 \n4 0 ... 2012-07-19 2012 \n... ... ... ... ... \n64261 0 ... 2022-07-07 2022 \n64262 0 ... 2022-07-07 2022 \n64263 0 ... 2022-07-07 2022 \n64264 0 ... 2022-07-07 2022 \n64265 0 ... 2022-07-07 2022 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n0 335277427 Y 43754737 330225325 \\\n1 364719889 Y 43991052 330322632 \n2 364923578 N 43881056 330350961 \n3 335927718 N 43923624 330374780 \n4 365345607 N 43533009 330424360 \n... ... ... ... ... \n64261 575551872 N 82260109 575551871 \n64262 575551947 N 82260125 575551946 \n64263 575553944 N 79460210 564546189 \n64264 575553976 N 82260272 575553975 \n64265 575556092 N 82132815 575038927 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n0 4 16 1 1 \n1 2 5 1 2 \n2 7 12 2 5 \n3 2 8 5 4 \n4 4 13 3 2 \n... ... ...
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_publn_date</th>\n <th>earliest_publn_year</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>330225325</td>\n <td>EP</td>\n <td>11150195</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150195</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2011-07-13</td>\n <td>2011</td>\n <td>335277427</td>\n <td>Y</td>\n <td>43754737</td>\n <td>330225325</td>\n <td>4</td>\n <td>16</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>330322632</td>\n <td>EP</td>\n <td>11150485</td>\n <td>A</td>\n <td>2011-01-10</td>\n <td>2011</td>\n <td>11150485</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2012-07-11</td>\n <td>2012</td>\n <td>364719889</td>\n <td>Y</td>\n <td>43991052</td>\n <td>330322632</td>\n <td>2</td>\n <td>5</td>\n <td>1</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>330350961</td>\n <td>EP</td>\n <td>11150683</td>\n <td>A</td>\n <td>2011-01-12</td>\n <td>2011</td>\n <td>11150683</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2012-07-18</td>\n <td>2012</td>\n <td>364923578</td>\n <td>N</td>\n <td>43881056</td>\n <td>330350961</td>\n <td>7</td>\n <td>12</td>\n <td>2</td>\n <td>5</td>\n </tr>\n <tr>\n <th>3</th>\n <td>330374780</td>\n <td>WO</td>\n <td>2011050339</td>\n <td>W</td>\n <td>2011-01-12</td>\n <td>2011</td>\n <td>EP2011/050339</td>\n <td>PI</td>\n <td>EP</td>\n <td>0</td>\n <td>...</td>\n <td>2011-07-21</td>\n <td>2011</td>\n <td>335927718</td>\n <td>N</td>\n <td>43923624</td>\n <td>330374780</td>\n <td>2</td>\n <td>8</td>\n <td>5</td>\n <td>4</td>\n </tr>\n <tr>\n <th>4</th>\n <td>330424360</td>\n <td>WO</td>\n <td>2011050199</td>\n <td>W</td>\n <td>2011-01-10</td>\n <td>2011</td>\n <td>EP2011/050199</td>\n <td>PI</td>\n <td>EP</td>\n <td>0</td>\n <td>...</td>\n <td>2012-07-19</td>\n <td>2012</td>\n <td>365345607</td>\n <td>N</td>\n <td>43533009</td>\n <td>330424360</td>\n <td>4</td>\n <td>13</td>\n <td>3</td>\n <td>2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>64261</th>\n <td>575551871</td>\n <td>WO</td>\n <td>2020142401</td>\n
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_title_lg \n0 330225325 en \\\n1 330322632 en \n2 330350961 en \n3 330374780 en \n4 330424360 en \n... ... ... \n64258 575551871 en \n64259 575551946 en \n64260 575553943 en \n64261 575553975 en \n64262 575556091 en \n\n appln_title \n0 Beverage preparation machine \n1 Method and system for recommending contextual ... \n2 A method and an apparatus for treating at leas... \n3 A METHOD FOR DIAGNOSIS OF FAULT IN VEHICULAR W... \n4 ERROR CONTROL IN A COMMUNICATION SYSTEM \n... ... \n64258 IMAGE STITCHING METHOD AND APPARATUS, AND COMP... \n64259 LOW VOC AND FOOD GRADE RESEALABLE LABEL \n64260 METHOD, DEVICE, COMPUTER READABLE MEDIUM, AND ... \n64261 MULTISPECIFIC ANTIGEN BINDING PROTEINS \n64262 SYSTEM AND METHOD FOR METHANE HYDRATE BASED PR... \n\n[64263 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>330225325</td>\n <td>en</td>\n <td>Beverage preparation machine</td>\n </tr>\n <tr>\n <th>1</th>\n <td>330322632</td>\n <td>en</td>\n <td>Method and system for recommending contextual ...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>330350961</td>\n <td>en</td>\n <td>A method and an apparatus for treating at leas...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>330374780</td>\n <td>en</td>\n <td>A METHOD FOR DIAGNOSIS OF FAULT IN VEHICULAR W...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>330424360</td>\n <td>en</td>\n <td>ERROR CONTROL IN A COMMUNICATION SYSTEM</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>64258</th>\n <td>575551871</td>\n <td>en</td>\n <td>IMAGE STITCHING METHOD AND APPARATUS, AND COMP...</td>\n </tr>\n <tr>\n <th>64259</th>\n <td>575551946</td>\n <td>en</td>\n <td>LOW VOC AND FOOD GRADE RESEALABLE LABEL</td>\n </tr>\n <tr>\n <th>64260</th>\n <td>575553943</td>\n <td>en</td>\n <td>METHOD, DEVICE, COMPUTER READABLE MEDIUM, AND ...</td>\n </tr>\n <tr>\n <th>64261</th>\n <td>575553975</td>\n <td>en</td>\n <td>MULTISPECIFIC ANTIGEN BINDING PROTEINS</td>\n </tr>\n <tr>\n <th>64262</th>\n <td>575556091</td>\n <td>en</td>\n <td>SYSTEM AND METHOD FOR METHANE HYDRATE BASED PR...</td>\n </tr>\n </tbody>\n</table>\n<p>64263 rows × 3 columns</p>\n</div>"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_title"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 330225325 EP 11150195 A 2011-01-05 \\\n1 330322632 EP 11150485 A 2011-01-10 \n2 330350961 EP 11150683 A 2011-01-12 \n3 330374780 WO 2011050339 W 2011-01-12 \n4 330424360 WO 2011050199 W 2011-01-10 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 2011 11150195 PI \\\n1 2011 11150485 PI \n2 2011 11150683 PI \n3 2011 EP2011/050339 PI EP \n4 2011 EP2011/050199 PI EP \n\n internat_appln_id ... earliest_pat_publn_id granted docdb_family_id \n0 0 ... 335277427 Y 43754737 \\\n1 0 ... 364719889 Y 43991052 \n2 0 ... 364923578 N 43881056 \n3 0 ... 335927718 N 43923624 \n4 0 ... 365345607 N 43533009 \n\n inpadoc_family_id docdb_family_size nb_citing_docdb_fam nb_applicants \n0 330225325 4 16 1 \\\n1 330322632 2 5 1 \n2 330350961 7 12 2 \n3 330374780 2 8 5 \n4 330424360 4 13 3 \n\n nb_inventors appln_title_lg \n0 1 en \\\n1 2 en \n2 5 en \n3 4 en \n4 2 en \n\n appln_title \n0 Beverage preparation machine \n1 Method and system for recommending contextual ... \n2 A method and an apparatus for treating at leas... \n3 A METHOD FOR DIAGNOSIS OF FAULT IN VEHICULAR W... \n4 ERROR CONTROL IN A COMMUNICATION SYSTEM \n\n[5 rows x 28 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>330225325</td>\n <td>EP</td>\n <td>11150195</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150195</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277427</td>\n <td>Y</td>\n <td>43754737</td>\n <td>330225325</td>\n <td>4</td>\n <td>16</td>\n <td>1</td>\n <td>1</td>\n <td>en</td>\n <td>Beverage preparation machine</td>\n </tr>\n <tr>\n <th>1</th>\n <td>330322632</td>\n <td>EP</td>\n <td>11150485</td>\n <td>A</td>\n <td>2011-01-10</td>\n <td>2011</td>\n <td>11150485</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364719889</td>\n <td>Y</td>\n <td>43991052</td>\n <td>330322632</td>\n <td>2</td>\n <td>5</td>\n <td>1</td>\n <td>2</td>\n <td>en</td>\n <td>Method and system for recommending contextual ...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>330350961</td>\n <td>EP</td>\n <td>11150683</td>\n <td>A</td>\n <td>2011-01-12</td>\n <td>2011</td>\n <td>11150683</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364923578</td>\n <td>N</td>\n <td>43881056</td>\n <td>330350961</td>\n <td>7</td>\n <td>12</td>\n <td>2</td>\n <td>5</td>\n <td>en</td>\n <td>A method and an apparatus for treating at leas...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>330374780</td>\n <td>WO</td>\n <td>2011050339</td>\n <td>W</td>\n <td>2011-01-12</td>\n <td>2011</td>\n <td>EP2011/050339</td>\n <td>PI</td>\n <td>EP</td>\n <td>0</td>\n <td>...</td>\n <td>335927718</td>\n <td>N</td>\n <td>43923624</td>\n <td>330374780</td>\n <td>2</td>\n <td>8</td>\n <td>5</td>\n <td>4</td>\n <td>en</td>\n <td>A METHOD FOR DIAGNOSIS OF FAULT IN VEHICULAR W...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>330424360</td>\n <td>WO</td>\n <td>2011050199</td>\n <td>W</td>\n <td>2011-01-10</td>\n <td>2011</td>\n <td>EP2011/050199</td>\n <td>PI</td>\n <td>EP</td>\n <td>0</td>\n <td>...</td>\n <td>365345607</td>\n <td>N</td>\n <td>43533009</td>\n <td>330424360</td>\n <td>4</td>\n <td>13</td>\n <td>3</td>\n <td>2</td>\n <td>en</td>\n <td>ERROR CONTROL IN A COMMUNICATION SYSTEM</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 28 columns</p>\n</div>"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_data = appln.merge(appln_title, on=\"appln_id\")\n",
"appln_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "array(['EP', 'WO', 'LU', 'FI', 'FR', 'ES', 'NO', 'US', 'GB', 'DO', 'DE',\n 'CA', 'UY', 'SV', 'KR', 'TR', 'CR', 'TW', 'NL', 'SG', 'CO', 'DK',\n 'CU', 'HR', 'AR', 'RU', 'AU', 'PL', 'BE', 'BR', 'MX', 'AP', 'MC',\n 'EC', 'PE', 'HU', 'EA', 'AT', 'RO', 'PT', 'CZ', 'IS', 'HN', 'MA',\n 'MD', 'CN', 'GT', 'UA', 'CL', 'SK', 'PH', 'MY', 'SI', 'HK', 'RS',\n 'IN', 'VN', 'TN', 'IL', 'GE', 'CY', 'SM', 'ZA', 'SE', 'CH', 'LT',\n 'ME', 'JO', 'NI', 'JP', 'SA', 'LV'], dtype=object)"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_data[\"appln_auth\"].unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": " person_id appln_id applt_seq_nr invt_seq_nr\n0 1 340314532 1 0\n1 1 413601768 1 0\n2 21 332015605 1 0\n3 21 333490084 1 0\n4 21 335903805 1 0\n... ... ... ... ...\n274039 85719932 545918634 0 2\n274040 85720336 569409547 0 4\n274041 85720376 555215896 0 2\n274042 85720469 569304088 0 5\n274043 85720500 569495993 0 5\n\n[274044 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>appln_id</th>\n <th>applt_seq_nr</th>\n <th>invt_seq_nr</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>340314532</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>413601768</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>21</td>\n <td>332015605</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>21</td>\n <td>333490084</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>21</td>\n <td>335903805</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>274039</th>\n <td>85719932</td>\n <td>545918634</td>\n <td>0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>274040</th>\n <td>85720336</td>\n <td>569409547</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>274041</th>\n <td>85720376</td>\n <td>555215896</td>\n <td>0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>274042</th>\n <td>85720469</td>\n <td>569304088</td>\n <td>0</td>\n <td>5</td>\n </tr>\n <tr>\n <th>274043</th>\n <td>85720500</td>\n <td>569495993</td>\n <td>0</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n<p>274044 rows × 4 columns</p>\n</div>"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": " person_id person_name person_name_orig_lg \n0 1 Nokia Corporation Nokia Corporation \\\n1 128 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n2 5217785 Nokia Corporation Nokia Corporation \n3 5217811 Nokia Corporation Nokia Corporation \n4 5232170 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n... ... ... ... \n112235 85719932 VIKSTREM, Erik ВИКСТРЁМ, Эрик \n112236 85720336 HWANG, LING-CHI HWANG, LING-CHI \n112237 85720376 LI, I Chan LI, I Chan \n112238 85720469 TING, Chia Ching TING, Chia Ching \n112239 85720500 WANG, YU-CHEIH WANG, YU-CHEIH \n\n person_address person_ctry_code nuts nuts_level \n0 Keilalahdentie 4,02150 Espoo FI FI1B1 3 \\\n1 Karaportti 3,02610 Espoo FI FI1B1 3 \n2 Espoo FI FI 0 \n3 NaN FI FI 0 \n4 Espoo FI FI 0 \n... ... ... ... ... \n112235 NaN SE SE 0 \n112236 NaN TW NaN 9 \n112237 NaN TW NaN 9 \n112238 TW TW NaN 9 \n112239 NaN TW NaN 9 \n\n doc_std_name_id doc_std_name psn_id \n0 1 NOKIA CORP 23782051 \\\n1 112 NOKIA SIEMENS NETWORKS OY 23782129 \n2 1 NOKIA CORP 23782051 \n3 1 NOKIA CORP 23782051 \n4 112 NOKIA SIEMENS NETWORKS OY 23782129 \n... ... ... ... \n112235 38919340 VIKSTREM ERIK 185719932 \n112236 35599384 HWANG LING-CHI 185720336 \n112237 38707281 LI I CHAN 185720376 \n112238 23937900 TING CHIA CHING 185720469 \n112239 38204835 WANG YU-CHEIH 185720500 \n\n psn_name psn_level psn_sector han_id han_name \n0 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \\\n1 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n2 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n3 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n4 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n... ... ... ... ... ... \n112235 VIKSTREM, Erik 0 UNKNOWN 185719932 VIKSTREM, Erik \n112236 HWANG, LING-CHI 0 UNKNOWN 185720336 HWANG, LING-CHI \n112237 LI, I Chan 0 UNKNOWN 185720376 LI, I Chan \n112238 TING, Chia Ching 0 UNKNOWN 185720469 TING, Chia Ching \n112239 WANG, YU-CHEIH 0 UNKNOWN 185720500 WANG, YU-CHEIH \n\n han_harmonized psn_sector_primary \n0 2 COMPANY \n1 2 COMPANY \n2 2 COMPANY \n3 2 COMPANY \n4 2 COMPANY \n... ... ... \n112235 0 UNKNOWN \n112236 0 UNKNOWN \n112237 0 UNKNOWN \n
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_name_orig_lg</th>\n <th>person_address</th>\n <th>person_ctry_code</th>\n <th>nuts</th>\n <th>nuts_level</th>\n <th>doc_std_name_id</th>\n <th>doc_std_name</th>\n <th>psn_id</th>\n <th>psn_name</th>\n <th>psn_level</th>\n <th>psn_sector</th>\n <th>han_id</th>\n <th>han_name</th>\n <th>han_harmonized</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Keilalahdentie 4,02150 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>128</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Karaportti 3,02610 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>5217785</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>5217811</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>NaN</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5232170</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>112235</th>\n <td>85719932</td>\n <td>VIKSTREM, Erik</td>\n <td>ВИКСТРЁМ, Эрик</td>\n <td>NaN</td>\n <td>SE</td>\n <td>SE</td>\n <td>0</td>\n <td>38919340</td>\n <td>VIKSTREM ERIK</td>\n <td>185719932</td>\n <td>VIKSTREM, Erik</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>185719932</td>\n <td>VIKSTREM, Erik</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pers_sector_primary = pers.groupby(\"han_id\", as_index=False)[\"psn_sector\"].agg(\n",
" lambda x: pd.Series.mode(x)[0]).rename(columns={\"psn_sector\":\"psn_sector_primary\"})\n",
"persn = pers.merge(pers_sector_primary, on='han_id')\n",
"persn"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": " han_id psn_sector_primary\n0 264 GOV NON-PROFIT UNIVERSITY\n1 627 COMPANY\n2 974 COMPANY\n3 1480 COMPANY\n4 1699 COMPANY\n... ... ...\n106154 185719932 UNKNOWN\n106155 185720336 UNKNOWN\n106156 185720376 UNKNOWN\n106157 185720469 UNKNOWN\n106158 185720500 UNKNOWN\n\n[106159 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>han_id</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>264</td>\n <td>GOV NON-PROFIT UNIVERSITY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>627</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>974</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1480</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1699</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>106154</th>\n <td>185719932</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106155</th>\n <td>185720336</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106156</th>\n <td>185720376</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106157</th>\n <td>185720469</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106158</th>\n <td>185720500</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>106159 rows × 2 columns</p>\n</div>"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pers_sector_primary"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"appln_merge = appln.merge(appln_title, on=\"appln_id\")#.merge(appln_pers,on=\"appln_id\")\n",
"appln_merge.to_excel(\"appln_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"person_merge = appln_pers.merge(pers,on=\"person_id\")\n",
"person_merge.to_excel(\"person_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [
{
"data": {
"text/plain": "array(['FI', 'NL', 'FR', 'DE', 'DK', 'AT', 'SE', 'BE', 'TW', 'LU', 'CN',\n 'IT', 'HU', 'IE', 'SI', 'CZ', 'ES', 'HK', 'PL', 'CY', 'SK', 'PT',\n 'LT', 'EE', 'MT', 'GR', 'RO', 'BG', 'HR', 'MO', 'LV'], dtype=object)"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pers[\"person_ctry_code\"].unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"ename": "KeyError",
"evalue": "'cry_code'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3649\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3648\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 3649\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3650\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_libs\\index.pyx:147\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_libs\\index.pyx:176\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
"File \u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi:7080\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
"File \u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi:7088\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
"\u001B[1;31mKeyError\u001B[0m: 'cry_code'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[16], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mperson_merge\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mcry_code\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[38;5;241m.\u001B[39munique()\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\core\\frame.py:3745\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3743\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m 3744\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[1;32m-> 3745\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3746\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[0;32m 3747\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3651\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3649\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_engine\u001B[38;5;241m.\u001B[39mget_loc(casted_key)\n\u001B[0;32m 3650\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n\u001B[1;32m-> 3651\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[0;32m 3652\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[0;32m 3653\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[0;32m 3654\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[0;32m 3655\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[0;32m 3656\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n",
"\u001B[1;31mKeyError\u001B[0m: 'cry_code'"
]
}
],
"source": [
"person_merge[\"cry_code\"].unique()"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}