You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_analysis_pipeline.i...

286 lines
42 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import janitor\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"outdir = \"WESTERN_CH_scope\"\n",
"\n",
"\n",
"def read_scope_table(table):\n",
"    # Load the CSV named '{table}_scope.csv' from the outdir folder into a DataFrame.\n",
"    return pd.read_csv(f\"{outdir}/{table}_scope.csv\")\n",
"\n",
"\n",
"# Applications (appln_id, appln_auth, filing date/year, family ids, counts, ...).\n",
"appln = read_scope_table(\"tls_201\")\n",
"\n",
"# Titles per application (appln_id, appln_title_lg, appln_title).\n",
"appln_title = read_scope_table(\"tls_202\")\n",
"\n",
"# Persons; a missing psn_sector is replaced by the explicit label \"UNKNOWN\".\n",
"pers = read_scope_table(\"tls_206\").assign(\n",
"    psn_sector=lambda persons: persons[\"psn_sector\"].fillna(\"UNKNOWN\")\n",
")\n",
"\n",
"# Person/application link rows (person_id, appln_id, applt_seq_nr, invt_seq_nr).\n",
"appln_pers = read_scope_table(\"tls_207\")\n",
"\n",
"# Loaded from tls_224; not referenced by the other cells visible in this notebook.\n",
"appln_cpc = read_scope_table(\"tls_224\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"data": {
"text/plain": "203873"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows in the application table.\n",
"appln.shape[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_title_lg \n6613 365740889 en \\\n15307 405955962 en \n34917 420680979 en \n65975 456943983 en \n31541 418199646 en \n... ... ... \n53388 444848074 en \n164003 549678226 en \n19193 409424261 en \n158766 545277468 en \n106813 498995405 en \n\n appln_title \n6613 CARD CONNECTOR \n15307 SERVO WRITE ASSEMBLY \n34917 CHILD SLEEPING APPARATUS WITH ADJUSTABLE SLEEP... \n65975 - NEAR-FIELD TRANSDUCER WITH RECESSED REGION \n31541 ELECTROSTATIC SPRAY TOOL SYSTEM \n... ... \n53388 Brassiere shoulder-strap closure \n164003 System and method for validating honest test t... \n19193 Hybrid wind turbine blade bearing \n158766 Video coding with successive codecs \n106813 IMAGE RECOGNITION METHOD AND APPARATUS \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>6613</th>\n <td>365740889</td>\n <td>en</td>\n <td>CARD CONNECTOR</td>\n </tr>\n <tr>\n <th>15307</th>\n <td>405955962</td>\n <td>en</td>\n <td>SERVO WRITE ASSEMBLY</td>\n </tr>\n <tr>\n <th>34917</th>\n <td>420680979</td>\n <td>en</td>\n <td>CHILD SLEEPING APPARATUS WITH ADJUSTABLE SLEEP...</td>\n </tr>\n <tr>\n <th>65975</th>\n <td>456943983</td>\n <td>en</td>\n <td>- NEAR-FIELD TRANSDUCER WITH RECESSED REGION</td>\n </tr>\n <tr>\n <th>31541</th>\n <td>418199646</td>\n <td>en</td>\n <td>ELECTROSTATIC SPRAY TOOL SYSTEM</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>53388</th>\n <td>444848074</td>\n <td>en</td>\n <td>Brassiere shoulder-strap closure</td>\n </tr>\n <tr>\n <th>164003</th>\n <td>549678226</td>\n <td>en</td>\n <td>System and method for validating honest test t...</td>\n </tr>\n <tr>\n <th>19193</th>\n <td>409424261</td>\n <td>en</td>\n <td>Hybrid wind turbine blade bearing</td>\n </tr>\n <tr>\n <th>158766</th>\n <td>545277468</td>\n <td>en</td>\n <td>Video coding with successive codecs</td>\n </tr>\n <tr>\n <th>106813</th>\n <td>498995405</td>\n <td>en</td>\n <td>IMAGE RECOGNITION METHOD AND APPARATUS</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check 100 titles. The fixed random_state makes the same rows appear on\n",
"# every Restart and Run All, so the stored output stays reproducible.\n",
"appln_title.sample(100, random_state=0)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 330225325 EP 11150195 A 2011-01-05 \\\n1 330225397 EP 11150231 A 2011-01-05 \n2 330322632 EP 11150485 A 2011-01-10 \n3 330326785 EP 11150605 A 2011-01-11 \n4 330350961 EP 11150683 A 2011-01-12 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 2011 11150195 PI \\\n1 2011 11150231 PI \n2 2011 11150485 PI \n3 2011 11150605 PI \n4 2011 11150683 PI \n\n internat_appln_id ... earliest_pat_publn_id granted docdb_family_id \n0 0 ... 335277427 Y 43754737 \\\n1 0 ... 335277736 Y 43619902 \n2 0 ... 364719889 Y 43991052 \n3 0 ... 335277720 N 43023665 \n4 0 ... 364923578 N 43881056 \n\n inpadoc_family_id docdb_family_size nb_citing_docdb_fam nb_applicants \n0 330225325 4 16 1 \\\n1 330225397 6 56 1 \n2 330322632 2 5 1 \n3 328518903 6 9 1 \n4 330350961 7 13 2 \n\n nb_inventors appln_title_lg \n0 1 en \\\n1 9 en \n2 2 en \n3 3 en \n4 5 en \n\n appln_title \n0 Beverage preparation machine \n1 Screwdriving tool having a driving tool with a... \n2 Method and system for recommending contextual ... \n3 Apparatus and method for continuous casting of... \n4 A method and an apparatus for treating at leas... \n\n[5 rows x 28 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n <th>appln_title_lg</th>\n <th>appln_title</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>330225325</td>\n <td>EP</td>\n <td>11150195</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150195</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277427</td>\n <td>Y</td>\n <td>43754737</td>\n <td>330225325</td>\n <td>4</td>\n <td>16</td>\n <td>1</td>\n <td>1</td>\n <td>en</td>\n <td>Beverage preparation machine</td>\n </tr>\n <tr>\n <th>1</th>\n <td>330225397</td>\n <td>EP</td>\n <td>11150231</td>\n <td>A</td>\n <td>2011-01-05</td>\n <td>2011</td>\n <td>11150231</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277736</td>\n <td>Y</td>\n <td>43619902</td>\n <td>330225397</td>\n <td>6</td>\n <td>56</td>\n <td>1</td>\n <td>9</td>\n <td>en</td>\n <td>Screwdriving tool having a driving tool with a...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>330322632</td>\n <td>EP</td>\n <td>11150485</td>\n <td>A</td>\n <td>2011-01-10</td>\n <td>2011</td>\n <td>11150485</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364719889</td>\n <td>Y</td>\n <td>43991052</td>\n <td>330322632</td>\n <td>2</td>\n <td>5</td>\n <td>1</td>\n 
<td>2</td>\n <td>en</td>\n <td>Method and system for recommending contextual ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>330326785</td>\n <td>EP</td>\n <td>11150605</td>\n <td>A</td>\n <td>2011-01-11</td>\n <td>2011</td>\n <td>11150605</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>335277720</td>\n <td>N</td>\n <td>43023665</td>\n <td>328518903</td>\n <td>6</td>\n <td>9</td>\n <td>1</td>\n <td>3</td>\n <td>en</td>\n <td>Apparatus and method for continuous casting of...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>330350961</td>\n <td>EP</td>\n <td>11150683</td>\n <td>A</td>\n <td>2011-01-12</td>\n <td>2011</td>\n <td>11150683</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>364923578</td>\n <td>N</td>\n <td>43881056</td>\n <td>330350961</td>\n <td>7</td>\n <td>13</td>\n <td>2</td>\n <td>5</td>\n <td>en</td>\n <td>A method and an apparatus for treating at leas...</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 28 columns</p>\n</div>"
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inner join on appln_id: each application row gains appln_title_lg and appln_title.\n",
"appln_data = pd.merge(appln, appln_title, on=\"appln_id\")\n",
"appln_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"data": {
"text/plain": "array(['EP', 'WO', 'LU', 'FI', 'NO', 'FR', 'GB', 'KR', 'ES', 'US', 'CA',\n 'DO', 'EC', 'DE', 'UY', 'IL', 'SV', 'PL', 'TR', 'CO', 'CR', 'TW',\n 'MA', 'PE', 'SG', 'CU', 'BE', 'DK', 'AR', 'AP', 'HR', 'MX', 'BR',\n 'EA', 'RU', 'AU', 'MC', 'HU', 'PT', 'NL', 'HN', 'AT', 'RO', 'SM',\n 'CH', 'SI', 'IS', 'CZ', 'HK', 'MD', 'JP', 'CN', 'RS', 'GT', 'UA',\n 'CL', 'SK', 'LT', 'PH', 'MY', 'IN', 'VN', 'TN', 'CY', 'GE', 'ZA',\n 'SE', 'ME', 'JO', 'NI', 'SA'], dtype=object)"
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct appln_auth codes present in the merged application table.\n",
"appln_data.appln_auth.unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": " person_id appln_id applt_seq_nr invt_seq_nr\n0 1 413601768 1 0\n1 21 332015605 1 0\n2 21 333490084 1 0\n3 21 335903805 1 0\n4 76 352908776 1 0\n... ... ... ... ...\n1025446 88836321 577982223 1 0\n1025447 88836333 583342135 0 4\n1025448 88836333 583342207 0 3\n1025449 88836333 585957705 0 5\n1025450 88836337 579601496 0 1\n\n[1025451 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>appln_id</th>\n <th>applt_seq_nr</th>\n <th>invt_seq_nr</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>413601768</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>21</td>\n <td>332015605</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>21</td>\n <td>333490084</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>21</td>\n <td>335903805</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>76</td>\n <td>352908776</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1025446</th>\n <td>88836321</td>\n <td>577982223</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1025447</th>\n <td>88836333</td>\n <td>583342135</td>\n <td>0</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1025448</th>\n <td>88836333</td>\n <td>583342207</td>\n <td>0</td>\n <td>3</td>\n </tr>\n <tr>\n <th>1025449</th>\n <td>88836333</td>\n <td>585957705</td>\n <td>0</td>\n <td>5</td>\n </tr>\n <tr>\n <th>1025450</th>\n <td>88836337</td>\n <td>579601496</td>\n <td>0</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n<p>1025451 rows × 4 columns</p>\n</div>"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"data": {
"text/plain": " person_id person_name person_name_orig_lg \n0 1 Nokia Corporation Nokia Corporation \\\n1 128 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n2 5217785 Nokia Corporation Nokia Corporation \n3 5217811 Nokia Corporation Nokia Corporation \n4 5232170 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n... ... ... ... \n354633 88836234 WONG, Chun Lok WONG, Chun Lok \n354634 88836257 XIAONING YE XIAONING YE \n354635 88836321 ZAI LAB (US) LLC ZAI LAB (US) LLC \n354636 88836333 ZHANG, Haocheng 张皓程 \n354637 88836337 ZHANG, Yangjun ZHANG, Yangjun \n\n person_address person_ctry_code nuts nuts_level \n0 Keilalahdentie 4,02150 Espoo FI FI1B1 3 \\\n1 Karaportti 3,02610 Espoo FI FI1B1 3 \n2 Espoo FI FI 0 \n3 NaN FI FI 0 \n4 Espoo FI FI 0 \n... ... ... ... ... \n354633 NaN US NaN 9 \n354634 Portland, Oregon US US NaN 9 \n354635 NaN US NaN 9 \n354636 NaN US NaN 9 \n354637 NaN US NaN 9 \n\n doc_std_name_id doc_std_name psn_id \n0 1 NOKIA CORP 23782051 \\\n1 112 NOKIA SIEMENS NETWORKS OY 23782129 \n2 1 NOKIA CORP 23782051 \n3 1 NOKIA CORP 23782051 \n4 112 NOKIA SIEMENS NETWORKS OY 23782129 \n... ... ... ... \n354633 30867225 WONG CHUN LOK 188836234 \n354634 8004293 XIAONING YE 188836257 \n354635 39363494 ZAI LAB US LLC 188836321 \n354636 7682590 ZHANG HAOCHENG 188836333 \n354637 2112344 ZHANG YANGJUN 188836337 \n\n psn_name psn_level psn_sector han_id han_name \n0 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \\\n1 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n2 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n3 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n4 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n... ... ... ... ... ... 
\n354633 WONG, Chun Lok 0 UNKNOWN 188836234 WONG, Chun Lok \n354634 XIAONING YE 0 UNKNOWN 188836257 XIAONING YE \n354635 ZAI LAB (US) LLC 0 UNKNOWN 188836321 ZAI LAB (US) LLC \n354636 ZHANG, Haocheng 0 UNKNOWN 188836333 ZHANG, Haocheng \n354637 ZHANG, Yangjun 0 UNKNOWN 188836337 ZHANG, Yangjun \n\n han_harmonized psn_sector_primary \n0 2 COMPANY \n1 2 COMPANY \n2 2 COMPANY \n3 2 COMPANY \n4 2 COMPANY \n... ... ... \n354633 0 UNKNOWN \n354634 0 UNKNOWN \n354635 0 UNKNOWN \n354636 0 UNKNOWN \n354637 0 UNKNOWN \n\n[354638 rows x 17 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_name_orig_lg</th>\n <th>person_address</th>\n <th>person_ctry_code</th>\n <th>nuts</th>\n <th>nuts_level</th>\n <th>doc_std_name_id</th>\n <th>doc_std_name</th>\n <th>psn_id</th>\n <th>psn_name</th>\n <th>psn_level</th>\n <th>psn_sector</th>\n <th>han_id</th>\n <th>han_name</th>\n <th>han_harmonized</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Keilalahdentie 4,02150 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>128</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Karaportti 3,02610 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>5217785</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>5217811</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>NaN</td>\n <td>FI</td>\n 
<td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5232170</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>354633</th>\n <td>88836234</td>\n <td>WONG, Chun Lok</td>\n <td>WONG, Chun Lok</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>30867225</td>\n <td>WONG CHUN LOK</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354634</th>\n <td>88836257</td>\n <td>XIAONING YE</td>\n <td>XIAONING YE</td>\n <td>Portland, Oregon US</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>8004293</td>\n <td>XIAONING YE</td>\n <td>188836257</td>\n <td>XIAONING YE</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836257</td>\n <td>XIAONING YE</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354635</th>\n <td>88836321</td>\n <td>ZAI LAB (US) LLC</td>\n <td>ZAI LAB (US) LLC</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>39363494</td>\n <td>ZAI LAB US LLC</td>\n <td>188836321</td>\n <td>ZAI LAB (US) LLC</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836321</td>\n <td>ZAI LAB (US) LLC</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n 
<tr>\n <th>354636</th>\n <td>88836333</td>\n <td>ZHANG, Haocheng</td>\n <td>张皓程</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>7682590</td>\n <td>ZHANG HAOCHENG</td>\n <td>188836333</td>\n <td>ZHANG, Haocheng</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836333</td>\n <td>ZHANG, Haocheng</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354637</th>\n <td>88836337</td>\n <td>ZHANG, Yangjun</td>\n <td>ZHANG, Yangjun</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>2112344</td>\n <td>ZHANG YANGJUN</td>\n <td>188836337</td>\n <td>ZHANG, Yangjun</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836337</td>\n <td>ZHANG, Yangjun</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>354638 rows × 17 columns</p>\n</div>"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": " person_id person_name person_name_orig_lg \n0 1 Nokia Corporation Nokia Corporation \\\n1 128 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n2 5217785 Nokia Corporation Nokia Corporation \n3 5217811 Nokia Corporation Nokia Corporation \n4 5232170 Nokia Siemens Networks Oy Nokia Siemens Networks Oy \n... ... ... ... \n354633 88836234 WONG, Chun Lok WONG, Chun Lok \n354634 88836257 XIAONING YE XIAONING YE \n354635 88836321 ZAI LAB (US) LLC ZAI LAB (US) LLC \n354636 88836333 ZHANG, Haocheng 张皓程 \n354637 88836337 ZHANG, Yangjun ZHANG, Yangjun \n\n person_address person_ctry_code nuts nuts_level \n0 Keilalahdentie 4,02150 Espoo FI FI1B1 3 \\\n1 Karaportti 3,02610 Espoo FI FI1B1 3 \n2 Espoo FI FI 0 \n3 NaN FI FI 0 \n4 Espoo FI FI 0 \n... ... ... ... ... \n354633 NaN US NaN 9 \n354634 Portland, Oregon US US NaN 9 \n354635 NaN US NaN 9 \n354636 NaN US NaN 9 \n354637 NaN US NaN 9 \n\n doc_std_name_id doc_std_name psn_id \n0 1 NOKIA CORP 23782051 \\\n1 112 NOKIA SIEMENS NETWORKS OY 23782129 \n2 1 NOKIA CORP 23782051 \n3 1 NOKIA CORP 23782051 \n4 112 NOKIA SIEMENS NETWORKS OY 23782129 \n... ... ... ... \n354633 30867225 WONG CHUN LOK 188836234 \n354634 8004293 XIAONING YE 188836257 \n354635 39363494 ZAI LAB US LLC 188836321 \n354636 7682590 ZHANG HAOCHENG 188836333 \n354637 2112344 ZHANG YANGJUN 188836337 \n\n psn_name psn_level psn_sector han_id han_name \n0 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \\\n1 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n2 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n3 NOKIA CORPORATION 2 COMPANY 2125445 NOKIA CORP \n4 NOKIA NETWORKS 2 COMPANY 2125445 NOKIA CORP \n... ... ... ... ... ... 
\n354633 WONG, Chun Lok 0 UNKNOWN 188836234 WONG, Chun Lok \n354634 XIAONING YE 0 UNKNOWN 188836257 XIAONING YE \n354635 ZAI LAB (US) LLC 0 UNKNOWN 188836321 ZAI LAB (US) LLC \n354636 ZHANG, Haocheng 0 UNKNOWN 188836333 ZHANG, Haocheng \n354637 ZHANG, Yangjun 0 UNKNOWN 188836337 ZHANG, Yangjun \n\n han_harmonized psn_sector_primary \n0 2 COMPANY \n1 2 COMPANY \n2 2 COMPANY \n3 2 COMPANY \n4 2 COMPANY \n... ... ... \n354633 0 UNKNOWN \n354634 0 UNKNOWN \n354635 0 UNKNOWN \n354636 0 UNKNOWN \n354637 0 UNKNOWN \n\n[354638 rows x 17 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_name_orig_lg</th>\n <th>person_address</th>\n <th>person_ctry_code</th>\n <th>nuts</th>\n <th>nuts_level</th>\n <th>doc_std_name_id</th>\n <th>doc_std_name</th>\n <th>psn_id</th>\n <th>psn_name</th>\n <th>psn_level</th>\n <th>psn_sector</th>\n <th>han_id</th>\n <th>han_name</th>\n <th>han_harmonized</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Keilalahdentie 4,02150 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>128</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Karaportti 3,02610 Espoo</td>\n <td>FI</td>\n <td>FI1B1</td>\n <td>3</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>5217785</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>5217811</td>\n <td>Nokia Corporation</td>\n <td>Nokia Corporation</td>\n <td>NaN</td>\n <td>FI</td>\n 
<td>FI</td>\n <td>0</td>\n <td>1</td>\n <td>NOKIA CORP</td>\n <td>23782051</td>\n <td>NOKIA CORPORATION</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5232170</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Nokia Siemens Networks Oy</td>\n <td>Espoo</td>\n <td>FI</td>\n <td>FI</td>\n <td>0</td>\n <td>112</td>\n <td>NOKIA SIEMENS NETWORKS OY</td>\n <td>23782129</td>\n <td>NOKIA NETWORKS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2125445</td>\n <td>NOKIA CORP</td>\n <td>2</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>354633</th>\n <td>88836234</td>\n <td>WONG, Chun Lok</td>\n <td>WONG, Chun Lok</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>30867225</td>\n <td>WONG CHUN LOK</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836234</td>\n <td>WONG, Chun Lok</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354634</th>\n <td>88836257</td>\n <td>XIAONING YE</td>\n <td>XIAONING YE</td>\n <td>Portland, Oregon US</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>8004293</td>\n <td>XIAONING YE</td>\n <td>188836257</td>\n <td>XIAONING YE</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836257</td>\n <td>XIAONING YE</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354635</th>\n <td>88836321</td>\n <td>ZAI LAB (US) LLC</td>\n <td>ZAI LAB (US) LLC</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>39363494</td>\n <td>ZAI LAB US LLC</td>\n <td>188836321</td>\n <td>ZAI LAB (US) LLC</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836321</td>\n <td>ZAI LAB (US) LLC</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n 
<tr>\n <th>354636</th>\n <td>88836333</td>\n <td>ZHANG, Haocheng</td>\n <td>张皓程</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>7682590</td>\n <td>ZHANG HAOCHENG</td>\n <td>188836333</td>\n <td>ZHANG, Haocheng</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836333</td>\n <td>ZHANG, Haocheng</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>354637</th>\n <td>88836337</td>\n <td>ZHANG, Yangjun</td>\n <td>ZHANG, Yangjun</td>\n <td>NaN</td>\n <td>US</td>\n <td>NaN</td>\n <td>9</td>\n <td>2112344</td>\n <td>ZHANG YANGJUN</td>\n <td>188836337</td>\n <td>ZHANG, Yangjun</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188836337</td>\n <td>ZHANG, Yangjun</td>\n <td>0</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>354638 rows × 17 columns</p>\n</div>"
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def most_frequent_sector(sectors):\n",
"    # mode() can return several values on a tie; the entry labelled 0 is kept.\n",
"    return sectors.mode()[0]\n",
"\n",
"\n",
"# One row per han_id holding its most frequent psn_sector.\n",
"pers_sector_primary = (\n",
"    pers.groupby(\"han_id\", as_index=False)[\"psn_sector\"]\n",
"    .agg(most_frequent_sector)\n",
"    .rename(columns={\"psn_sector\": \"psn_sector_primary\"})\n",
")\n",
"\n",
"# Attach the per-han_id primary sector to every person row.\n",
"persn = pd.merge(pers, pers_sector_primary, on=\"han_id\")\n",
"persn"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": " han_id psn_sector_primary\n0 32 COMPANY\n1 54 COMPANY\n2 83 COMPANY\n3 200 COMPANY\n4 264 GOV NON-PROFIT UNIVERSITY\n... ... ...\n335519 188836234 UNKNOWN\n335520 188836257 UNKNOWN\n335521 188836321 UNKNOWN\n335522 188836333 UNKNOWN\n335523 188836337 UNKNOWN\n\n[335524 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>han_id</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>32</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>54</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>83</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>200</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>264</td>\n <td>GOV NON-PROFIT UNIVERSITY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>335519</th>\n <td>188836234</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335520</th>\n <td>188836257</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335521</th>\n <td>188836321</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335522</th>\n <td>188836333</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335523</th>\n <td>188836337</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>335524 rows × 2 columns</p>\n</div>"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": " han_id psn_sector_primary\n0 32 COMPANY\n1 54 COMPANY\n2 83 COMPANY\n3 200 COMPANY\n4 264 GOV NON-PROFIT UNIVERSITY\n... ... ...\n335519 188836234 UNKNOWN\n335520 188836257 UNKNOWN\n335521 188836321 UNKNOWN\n335522 188836333 UNKNOWN\n335523 188836337 UNKNOWN\n\n[335524 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>han_id</th>\n <th>psn_sector_primary</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>32</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>1</th>\n <td>54</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>2</th>\n <td>83</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>3</th>\n <td>200</td>\n <td>COMPANY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>264</td>\n <td>GOV NON-PROFIT UNIVERSITY</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>335519</th>\n <td>188836234</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335520</th>\n <td>188836257</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335521</th>\n <td>188836321</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335522</th>\n <td>188836333</td>\n <td>UNKNOWN</td>\n </tr>\n <tr>\n <th>335523</th>\n <td>188836337</td>\n <td>UNKNOWN</td>\n </tr>\n </tbody>\n</table>\n<p>335524 rows × 2 columns</p>\n</div>"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pers_sector_primary"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [],
"source": [
"# Applications joined with their titles, written to an Excel workbook.\n",
"# Person links are not joined here; they are exported to person_data.xlsx instead.\n",
"appln_merge = pd.merge(appln, appln_title, on=\"appln_id\")\n",
"appln_merge.to_excel(\"appln_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [],
"source": [
"# Person/application links joined with the person attributes, written to Excel.\n",
"# NOTE(review): appln_pers displays 1,025,451 rows, close to the 1,048,576-row\n",
"# limit of an .xlsx sheet; the export is expected to fail if the merge grows past it.\n",
"person_merge = pd.merge(appln_pers, pers, on=\"person_id\")\n",
"person_merge.to_excel(\"person_data.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 30,
"outputs": [
{
"data": {
"text/plain": "array(['FI', 'NL', 'FR', 'CH', 'US', 'DE', 'DK', 'AT', 'SE', 'BE', 'CN',\n 'IT', 'LU', 'IE', 'SI', 'HK', 'MO', 'CZ', 'ES', 'NO', 'PL', 'HU',\n 'CY', 'SK', 'PT', 'EE', 'MT', 'GR', 'RO', 'BG', 'LT', 'HR', 'LV'],\n dtype=object)"
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct person_ctry_code values among the loaded persons.\n",
"pers.person_ctry_code.unique()"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}