You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_analysis_pipeline.i...

292 lines
34 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import janitor\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# Directory that holds the scoped table extracts read in this cell.\n",
"outdir = \"WESTERN_CH_scope\"\n",
"\n",
"appln = pd.read_csv(f\"{outdir}/tls_201_scope.csv\")\n",
"appln_title = pd.read_csv(f\"{outdir}/tls_202_scope.csv\")\n",
"\n",
"# Rows without a sector label get the explicit \"UNKNOWN\" category.\n",
"pers = pd.read_csv(f\"{outdir}/tls_206_scope.csv\").assign(\n",
"    psn_sector=lambda frame: frame[\"psn_sector\"].fillna(\"UNKNOWN\")\n",
")\n",
"\n",
"appln_pers = pd.read_csv(f\"{outdir}/tls_207_scope.csv\")\n",
"appln_cpc = pd.read_csv(f\"{outdir}/tls_224_scope.csv\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "203873"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row count of the frame read from tls_201_scope.csv.\n",
"appln.shape[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_title_lg \n106316 498640253 en \\\n119852 511974583 en \n193586 577006640 en \n172207 556318748 en \n117620 509549284 en \n... ... ... \n58791 448189845 en \n119362 511604550 en \n73722 471815906 en \n45133 438311946 en \n25978 414431520 en \n\n appln_title \n106316 DRAIN CLEANING DEVICE \n119852 Antenna panel switching and beam indication \n193586 Loft bed \n172207 Winch for Securing a Load \n117620 TEMPERATURE CONTROL APPARATUS FORELECTRIC VEHI... \n... ... \n58791 Collaborative spectrum sensing in cognitive ra... \n119362 CLIP-ON GLASSES WITH REPLACEABLE LENS \n73722 Sensitized, photo-sensitive glass and its prod... \n45133 PREPARATION OF 3,4-DIHYDRO-1,4-BENZOXAZEPIN-5(... \n25978 - CRYSTAL OSCILLATOR WITH LOW-POWER MODE \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>106316</th>\n <td>498640253</td>\n <td>en</td>\n <td>DRAIN CLEANING DEVICE</td>\n </tr>\n <tr>\n <th>119852</th>\n <td>511974583</td>\n <td>en</td>\n <td>Antenna panel switching and beam indication</td>\n </tr>\n <tr>\n <th>193586</th>\n <td>577006640</td>\n <td>en</td>\n <td>Loft bed</td>\n </tr>\n <tr>\n <th>172207</th>\n <td>556318748</td>\n <td>en</td>\n <td>Winch for Securing a Load</td>\n </tr>\n <tr>\n <th>117620</th>\n <td>509549284</td>\n <td>en</td>\n <td>TEMPERATURE CONTROL APPARATUS FORELECTRIC VEHI...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>58791</th>\n <td>448189845</td>\n <td>en</td>\n <td>Collaborative spectrum sensing in cognitive ra...</td>\n </tr>\n <tr>\n <th>119362</th>\n <td>511604550</td>\n <td>en</td>\n <td>CLIP-ON GLASSES WITH REPLACEABLE LENS</td>\n </tr>\n <tr>\n <th>73722</th>\n <td>471815906</td>\n <td>en</td>\n <td>Sensitized, photo-sensitive glass and its prod...</td>\n </tr>\n <tr>\n <th>45133</th>\n <td>438311946</td>\n <td>en</td>\n <td>PREPARATION OF 3,4-DIHYDRO-1,4-BENZOXAZEPIN-5(...</td>\n </tr>\n <tr>\n <th>25978</th>\n <td>414431520</td>\n <td>en</td>\n <td>- CRYSTAL OSCILLATOR WITH LOW-POWER MODE</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview 100 randomly chosen titles; the fixed random_state makes the same rows\n",
"# come back after every kernel restart, so the preview is reproducible.\n",
"appln_title.sample(100, random_state=42)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 330225325 EP 11150195 A 2011-01-05 \\\n1 330225397 EP 11150231 A 2011-01-05 \n2 330322632 EP 11150485 A 2011-01-10 \n3 330326785 EP 11150605 A 2011-01-11 \n4 330350961 EP 11150683 A 2011-01-12 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 2011 11150195 PI \\\n1 2011 11150231 PI \n2 2011 11150485 PI \n3 2011 11150605 PI \n4 2011 11150683 PI \n\n internat_appln_id ... earliest_pat_publn_id granted docdb_family_id \n0 0 ... 335277427 Y 43754737 \\\n1 0 ... 335277736 Y 43619902 \n2 0 ... 364719889 Y 43991052 \n3 0 ... 335277720 N 43023665 \n4 0 ... 364923578 N 43881056 \n\n inpadoc_family_id docdb_family_size nb_citing_docdb_fam nb_applicants \n0 330225325 4 16 1 \\\n1 330225397 6 56 1 \n2 330322632 2 5 1 \n3 328518903 6 9 1 \n4 330350961 7 13 2 \n\n nb_inventors appln_title_lg \n0 1 en \\\n1 9 en \n2 2 en \n3 3 en \n4 5 en \n\n appln_title \n0 Beverage preparation machine \n1 Screwdriving tool having a driving tool with a... \n2 Method and system for recommending contextual ... \n3 Apparatus and method for continuous casting of... \n4 A method and an apparatus for treating at leas... \n\n[5 rows x 28 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>330225325</td>\n <td>EP</td>\n <td>11150195</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150195</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277427</td>\n <td>Y</td>\n <td>43754737</td>\n <td>330225325</td>\n <td>4</td>\n <td>16</td>\n <td>1</td>\n <td>1</td>\n <td>en</td>\n <td>Beverage preparation machine</td>\n </tr>\n <tr>\n <th>1</th>\n <td>330225397</td>\n <td>EP</td>\n <td>11150231</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150231</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277736</td>\n <td>Y</td>\n <td>43619902</td>\n <td>330225397</td>\n <td>6</td>\n <td>56</td>\n <td>1</td>\n <td>9</td>\n <td>en</td>\n <td>Screwdriving tool having a driving tool with a...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>330322632</td>\n <td>EP</td>\n <td>11150485</td>\n <td>A</td>\n <td>2011-01-10</td>\n <td>2011</td>\n <td>11150485</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364719889</td>\n <td>Y</td>\n <td>43991052</td>\n <td>330322632</td>\n <td>2</td>\n <td>5</td>\n <td>1</td>\n 
<td>2</td>\n <td>en</td>\n <td>Method and system for recommending contextual ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>330326785</td>\n <td>EP</td>\n <td>11150605</td>\n <td>A</td>\n <td>2011-01-11</td>\n <td>2011</td>\n <td>11150605</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277720</td>\n <td>N</td>\n <td>43023665</td>\n <td>328518903</td>\n <td>6</td>\n <td>9</td>\n <td>1</td>\n <td>3</td>\n <td>en</td>\n <td>Apparatus and method for continuous casting of...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>330350961</td>\n <td>EP</td>\n <td>11150683</td>\n <td>A</td>\n <td>2011-01-12</td>\n <td>2011</td>\n <td>11150683</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364923578</td>\n <td>N</td>\n <td>43881056</td>\n <td>330350961</td>\n <td>7</td>\n <td>13</td>\n <td>2</td>\n <td>5</td>\n <td>en</td>\n <td>A method and an apparatus for treating at leas...</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 28 columns</p>\n</div>"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach the title columns to each application row (default inner join on appln_id,\n",
"# so rows without a match in the other frame are dropped).\n",
"appln_data = pd.merge(appln, appln_title, on=\"appln_id\")\n",
"appln_data.head(n=5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": "array(['EP', 'WO', 'LU', 'FI', 'NO', 'FR', 'GB', 'KR', 'ES', 'US', 'CA',\n 'DO', 'EC', 'DE', 'UY', 'IL', 'SV', 'PL', 'TR', 'CO', 'CR', 'TW',\n 'MA', 'PE', 'SG', 'CU', 'BE', 'DK', 'AR', 'AP', 'HR', 'MX', 'BR',\n 'EA', 'RU', 'AU', 'MC', 'HU', 'PT', 'NL', 'HN', 'AT', 'RO', 'SM',\n 'CH', 'SI', 'IS', 'CZ', 'HK', 'MD', 'JP', 'CN', 'RS', 'GT', 'UA',\n 'CL', 'SK', 'LT', 'PH', 'MY', 'IN', 'VN', 'TN', 'CY', 'GE', 'ZA',\n 'SE', 'ME', 'JO', 'NI', 'SA'], dtype=object)"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct appln_auth codes present in the merged frame, in order of first appearance.\n",
"pd.unique(appln_data[\"appln_auth\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": " person_id appln_id applt_seq_nr invt_seq_nr\n0 1 340314532 1 0\n1 1 413601768 1 0\n2 21 332015605 1 0\n3 21 333490084 1 0\n4 21 335903805 1 0\n... ... ... ... ...\n274039 85719932 545918634 0 2\n274040 85720336 569409547 0 4\n274041 85720376 555215896 0 2\n274042 85720469 569304088 0 5\n274043 85720500 569495993 0 5\n\n[274044 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>appln_id</th>\n <th>applt_seq_nr</th>\n <th>invt_seq_nr</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>340314532</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>413601768</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>21</td>\n <td>332015605</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>21</td>\n <td>333490084</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>21</td>\n <td>335903805</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>274039</th>\n <td>85719932</td>\n <td>545918634</td>\n <td>0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>274040</th>\n <td>85720336</td>\n <td>569409547</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>274041</th>\n <td>85720376</td>\n <td>555215896</td>\n <td>0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>274042</th>\n <td>85720469</td>\n <td>569304088</td>\n <td>0</td>\n <td>5</td>\n </tr>\n <tr>\n <th>274043</th>\n <td>85720500</td>\n <td>569495993</td>\n <td>0</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n<p>274044 rows × 4 columns</p>\n</div>"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show appln_pers (read from tls_207_scope.csv): each row pairs a person_id with an\n",
"# appln_id and carries the applt_seq_nr / invt_seq_nr columns.\n",
"appln_pers"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": " person_id person_name person_name_orig_lg \n0 1 Nokia Corporation Nokia Corporation \\\n1 128 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n2 5217785 Nokia Corporation Nokia Corporation \n3 5217811 Nokia Corporation Nokia Corporation \n4 5232170 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n... ... ... ... \n112235 85719932 VIKSTREM, Erik ВИКСТРЁМ, Эрик \n112236 85720336 HWANG, LING-CHI HWANG, LING-CHI \n112237 85720376 LI, I Chan LI, I Chan \n112238 85720469 TING, Chia Ching TING, Chia Ching \n112239 85720500 WANG, YU-CHEIH WANG, YU-CHEIH \n\n person_address person_ctry_code nuts nuts_level \n0 Keilalahdentie 4,02150 Espoo FI FI1B1 3 \\\n1 Karaportti 3,02610 Espoo FI FI1B1 3 \n2 Espoo FI FI 0 \n3 NaN FI FI 0 \n4 Espoo FI FI 0 \n... ... ... ... ... \n112235 NaN SE SE 0 \n112236 NaN TW NaN 9 \n112237 NaN TW NaN 9 \n112238 TW TW NaN 9 \n112239 NaN TW NaN 9 \n\n doc_std_name_id doc_std_name psn_id \n0 1 NOKIA CORP 23782051 \\\n1 112 NOKIA SIEMENS NETWORKS OY 23782129 \n2 1 NOKIA CORP 23782051 \n3 1 NOKIA CORP 23782051 \n4 112 NOKIA SIEMENS NETWORKS OY 23782129 \n... ... ... ... \n112235 38919340 VIKSTREM ERIK 185719932 \n112236 35599384 HWANG LING-CHI 185720336 \n112237 38707281 LI I CHAN 185720376 \n112238 23937900 TING CHIA CHING 185720469 \n112239 38204835 WANG YU-CHEIH 185720500 \n\n psn_name psn_level psn_sector han_id han_name \n0 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \\\n1 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n2 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n3 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n4 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n... ... ... ... ... ... 
\n112235 VIKSTREM, Erik 0 UNKNOWN 185719932 VIKSTREM, Erik \n112236 HWANG, LING-CHI 0 UNKNOWN 185720336 HWANG, LING-CHI \n112237 LI, I Chan 0 UNKNOWN 185720376 LI, I Chan \n112238 TING, Chia Ching 0 UNKNOWN 185720469 TING, Chia Ching \n112239 WANG, YU-CHEIH 0 UNKNOWN 185720500 WANG, YU-CHEIH \n\n han_harmonized psn_sector_primary \n0 2 COMPANY \n1 2 COMPANY \n2 2 COMPANY \n3 2 COMPANY \n4 2 COMPANY \n... ... ... \n112235 0 UNKNOWN \n112236 0 UNKNOWN \n112237 0 UNKNOWN \n112238 0 UNKNOWN \n112239 0 UNKNOWN \n\n[112240 rows x 17 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_name_orig_lg</th>\n <th>person_address</th>\n <th>person_ctry_code</th>\n <th>nuts</th>\n <th>nuts_level</th>\n <th>doc_std_name_id</th>\n <th>doc_std_name</th>\n <th>psn_id</th>\n <th>psn_name</th>\n <th>psn_level</th>\n <th>psn_sector</th>\n <th>han_id</th>\n <th>han_name</th>\n <th>han_harmonized</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Keilalahdentie 4,02150 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>128</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Karaportti 3,02610 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>5217785</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>5217811</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>NaN</td>\n <td>FI</td>\n 
<td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5232170</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>112235</th>\n <td>85719932</td>\n <td>VIKSTREM, Erik</td>\n <td>ВИКСТРЁМ, Эрик</td>\n <td>NaN</td>\n <td>SE</td>\n <td>SE</td>\n <td>0</td>\n <td>38919340</td>\n <td>VIKSTREM ERIK</td>\n <td>185719932</td>\n <td>VIKSTREM, Erik</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>185719932</td>\n <td>VIKSTREM, Erik</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>112236</th>\n <td>85720336</td>\n <td>HWANG, LING-CHI</td>\n <td>HWANG, LING-CHI</td>\n <td>NaN</td>\n <td>TW</td>\n <td>NaN</td>\n <td>9</td>\n <td>35599384</td>\n <td>HWANG LING-CHI</td>\n <td>185720336</td>\n <td>HWANG, LING-CHI</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>185720336</td>\n <td>HWANG, LING-CHI</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>112237</th>\n <td>85720376</td>\n <td>LI, I Chan</td>\n <td>LI, I Chan</td>\n <td>NaN</td>\n <td>TW</td>\n <td>NaN</td>\n <td>9</td>\n <td>38707281</td>\n <td>LI I CHAN</td>\n <td>185720376</td>\n <td>LI, I Chan</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>185720376</td>\n <td>LI, I Chan</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>112238</th>\n 
<td>85720469</td>\n <td>TING, Chia Ching</td>\n <td>TING, Chia Ching</td>\n <td>TW</td>\n <td>TW</td>\n <td>NaN</td>\n <td>9</td>\n <td>23937900</td>\n <td>TING CHIA CHING</td>\n <td>185720469</td>\n <td>TING, Chia Ching</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>185720469</td>\n <td>TING, Chia Ching</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>112239</th>\n <td>85720500</td>\n <td>WANG, YU-CHEIH</td>\n <td>WANG, YU-CHEIH</td>\n <td>NaN</td>\n <td>TW</td>\n <td>NaN</td>\n <td>9</td>\n <td>38204835</td>\n <td>WANG YU-CHEIH</td>\n <td>185720500</td>\n <td>WANG, YU-CHEIH</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>185720500</td>\n <td>WANG, YU-CHEIH</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>112240 rows × 17 columns</p>\n</div>"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# For every han_id keep the most frequent psn_sector; when several values tie,\n",
"# the first entry returned by mode() is used.\n",
"pers_sector_primary = (\n",
"    pers.groupby(\"han_id\", as_index=False)[\"psn_sector\"]\n",
"    .agg(lambda sectors: sectors.mode()[0])\n",
"    .rename(columns={\"psn_sector\": \"psn_sector_primary\"})\n",
")\n",
"\n",
"# Copy the per-han_id primary sector back onto every person row.\n",
"persn = pd.merge(pers, pers_sector_primary, on=\"han_id\")\n",
"persn"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": " han_id psn_sector_primary\n0 264 GOV NON-PROFIT UNIVERSITY\n1 627 COMPANY\n2 974 COMPANY\n3 1480 COMPANY\n4 1699 COMPANY\n... ... ...\n106154 185719932 UNKNOWN\n106155 185720336 UNKNOWN\n106156 185720376 UNKNOWN\n106157 185720469 UNKNOWN\n106158 185720500 UNKNOWN\n\n[106159 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>han_id</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>264</td>\n <td>GOV NON-PROFIT UNIVERSITY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>627</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>974</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1480</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1699</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>106154</th>\n <td>185719932</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106155</th>\n <td>185720336</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106156</th>\n <td>185720376</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106157</th>\n <td>185720469</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>106158</th>\n <td>185720500</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>106159 rows × 2 columns</p>\n</div>"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the han_id -> psn_sector_primary lookup built in the previous cell\n",
"# (one row per han_id).\n",
"pers_sector_primary"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"# Export the applications together with their titles (inner join on appln_id)\n",
"# to an Excel workbook in the working directory.\n",
"appln_merge = pd.merge(appln, appln_title, on=\"appln_id\")\n",
"appln_merge.to_excel(\"appln_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"# Join every appln_pers row with the matching person record (inner join on person_id)\n",
"# and export the result to an Excel workbook in the working directory.\n",
"person_merge = pd.merge(appln_pers, pers, on=\"person_id\")\n",
"person_merge.to_excel(\"person_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [
{
"data": {
"text/plain": "array(['FI', 'NL', 'FR', 'DE', 'DK', 'AT', 'SE', 'BE', 'TW', 'LU', 'CN',\n 'IT', 'HU', 'IE', 'SI', 'CZ', 'ES', 'HK', 'PL', 'CY', 'SK', 'PT',\n 'LT', 'EE', 'MT', 'GR', 'RO', 'BG', 'HR', 'MO', 'LV'], dtype=object)"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct person_ctry_code values in the person table, in order of first appearance.\n",
"pd.unique(pers[\"person_ctry_code\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Distinct country codes among the merged application-person rows.\n",
"# The column is named person_ctry_code (it comes from the person table);\n",
"# the earlier key \"cry_code\" does not exist and raised a KeyError.\n",
"person_merge[\"person_ctry_code\"].unique()"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}