You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/PATSTAT/patstat_data_filter_process...

702 lines
48 KiB
Plaintext

2 years ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 125 ms\n",
"Wall time: 269 ms\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"import dask"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": "<dask.config.set at 0x2ad796066a0>"
2 years ago
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.set(temporary_directory=r'D:\\PATSTAT\\dask_temp')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": "<dask.config.set at 0x2ad79986df0>"
2 years ago
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.set({'temporary_directory': r'D:\\PATSTAT\\dask_temp'})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": "{'temporary-directory': 'D:\\\\PATSTAT\\\\dask_temp',\n 'visualization': {'engine': None},\n 'tokenize': {'ensure-deterministic': False},\n 'dataframe': {'backend': 'pandas',\n 'shuffle': {'method': None, 'compression': None},\n 'parquet': {'metadata-task-size-local': 512, 'metadata-task-size-remote': 1},\n 'dtype_backend': 'pandas',\n 'convert_string': False},\n 'array': {'backend': 'numpy',\n 'rechunk': {'method': 'tasks'},\n 'svg': {'size': 120},\n 'slicing': {'split-large-chunks': None}},\n 'optimization': {'annotations': {'fuse': True},\n 'fuse': {'active': None,\n 'ave-width': 1,\n 'max-width': None,\n 'max-height': inf,\n 'max-depth-new-edges': None,\n 'subgraphs': None,\n 'rename-keys': True}}}"
2 years ago
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.config"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import dask.dataframe as dd\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n",
"D:\\PATSTAT\n"
]
}
],
"source": [
"import os\n",
"print(os.getcwd()) # Prints the current working directory\n",
"\n",
"workdir_path=r\"D:\\PATSTAT\"\n",
"os.chdir(workdir_path)\n",
"print(os.getcwd())"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 6,
2 years ago
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# tls_201 = dd.read_csv(\"table_tls201.csv\", low_memory=False,dtype={'appln_nr': 'object',\n",
"# 'appln_nr_original': 'object'})\n",
"# tls_201.head()\n",
"# tls_206 = dd.read_csv(\"table_tls206.csv\", low_memory=False)\n",
"# tls_206.head()\n",
"# tls_207 = dd.read_csv(\"table_tls207.csv\", low_memory=False)\n",
"# tls_207.head()\n",
"# tls_207.to_parquet(\"tls_207.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 0 XX None D 9999-12-31 \\\n1 1 EP 103094.0 A 2000-02-15 \n2 2 EP 107845.0 A 1992-12-02 \n3 3 EP 202556.0 A 2000-07-17 \n4 4 EP 300208.0 A 2000-01-13 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 9999 None PI \\\n1 2000 00103094 PI \n2 1992 00107845 PI \n3 2000 00202556 PI \n4 2000 00300208 PI \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n0 0 ... 9999-12-31 9999 \\\n1 0 ... 2000-09-20 2000 \n2 0 ... 2000-08-02 2000 \n3 0 ... 2001-01-24 2001 \n4 0 ... 2000-07-26 2000 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n0 0 N 0 0 \\\n1 293253293 Y 8554171 1 \n2 301548848 Y 27517085 2 \n3 291964096 N 7915918 3 \n4 292901055 Y 22889365 4 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n0 1 0 0 0 \n1 6 79 1 4 \n2 8 56 2 6 \n3 4 22 2 3 \n4 6 27 1 2 \n\n[5 rows x 26 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_publn_date</th>\n <th>earliest_publn_year</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>XX</td>\n <td>None</td>\n <td>D</td>\n <td>9999-12-31</td>\n <td>9999</td>\n <td>None</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>9999-12-31</td>\n <td>9999</td>\n <td>0</td>\n <td>N</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>EP</td>\n <td>103094.0</td>\n <td>A</td>\n <td>2000-02-15</td>\n <td>2000</td>\n <td>00103094</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-09-20</td>\n <td>2000</td>\n <td>293253293</td>\n <td>Y</td>\n <td>8554171</td>\n <td>1</td>\n <td>6</td>\n <td>79</td>\n <td>1</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>EP</td>\n <td>107845.0</td>\n <td>A</td>\n <td>1992-12-02</td>\n <td>1992</td>\n <td>00107845</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-08-02</td>\n <td>2000</td>\n <td>301548848</td>\n <td>Y</td>\n <td>27517085</td>\n <td>2</td>\n <td>8</td>\n <td>56</td>\n <td>2</td>\n <td>6</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>EP</td>\n <td>202556.0</td>\n <td>A</td>\n <td>2000-07-17</td>\n <td>2000</td>\n <td>00202556</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2001-01-24</td>\n <td>2001</td>\n <td>291964096</td>\n <td>N</td>\n <td>7915918</td>\n <td>3</td>\n <td>4</td>\n <td>22</td>\n <td>2</td>\n <td>3</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>EP</td>\n <td>300208.0</td>\n <td>A</td>\n <td>2000-01-13</td>\n <td>2000</td>\n <td>00300208</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-07-26</td>\n <td>2000</td>\n <td>292901055</td>\n <td>Y</td>\n <td>22889365</td>\n <td>4</td>\n <td>6</td>\n <td>27</td>\n <td>1</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 26 columns</p>\n</div>"
2 years ago
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Application data\n",
"tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n",
"tls_201_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n",
"# (tls_201_p[\"appln_filing_year\"]<2024)&\n",
"# (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n",
"# (tls_201_p[\"appln_filing_year\"]<2024)&\n",
"# (tls_201_p[\"granted\"]==\"N\"))][\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " person_id person_name person_ctry_code\n0 1 Nokia Corporation FI\n1 2 Lipponen, Markku FI\n2 3 Laitinen, Timo FI\n3 4 Aho, Ari FI\n4 5 Knuutila, Jarno FI",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Lipponen, Markku</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Laitinen, Timo</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Aho, Ari</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Knuutila, Jarno</td>\n <td>FI</td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_206_p = dd.read_parquet(\"tls_206.parquet\",columns=[\"person_id\",\"person_name\",\"person_ctry_code\"])\n",
"tls_206_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " person_id appln_id\n0 1 1\n1 1 7\n2 1 46\n3 1 775\n4 1 1192",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>appln_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>7</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1</td>\n <td>46</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1</td>\n <td>775</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1</td>\n <td>1192</td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_207_p = dd.read_parquet(\"tls_207.parquet\",columns=[\"person_id\",\"appln_id\"])\n",
"tls_207_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# tls_207_p[tls_207_p[\"appln_id\"]==1].compute()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent \n0 unknown Y NaN \\\n1 AD AND Andorra Europe \n2 AE ARE United Arab Emirates Asia \n3 AF AFG Afghanistan Asia \n4 AG ATG Antigua and Barbuda North America \n\n eu_member epo_member oecd_member discontinued \n0 \n1 \n2 \n3 \n4 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ctry_code</th>\n <th>iso_alpha3</th>\n <th>st3_name</th>\n <th>organisation_flag</th>\n <th>continent</th>\n <th>eu_member</th>\n <th>epo_member</th>\n <th>oecd_member</th>\n <th>discontinued</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td></td>\n <td></td>\n <td>unknown</td>\n <td>Y</td>\n <td>NaN</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>1</th>\n <td>AD</td>\n <td>AND</td>\n <td>Andorra</td>\n <td></td>\n <td>Europe</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>2</th>\n <td>AE</td>\n <td>ARE</td>\n <td>United Arab Emirates</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>3</th>\n <td>AF</td>\n <td>AFG</td>\n <td>Afghanistan</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>4</th>\n <td>AG</td>\n <td>ATG</td>\n <td>Antigua and Barbuda</td>\n <td></td>\n <td>North America</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_801 = dd.read_csv(\"table_tls801.csv\", low_memory=False)\n",
"tls_801.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " ctry_code iso_alpha3 st3_name organisation_flag \n47 CN CHN China \\\n97 HK HKG Hong Kong, China \n147 MO MAC Macao SAR (China) \n217 TW TWN Taiwan Province Of China \n\n continent eu_member epo_member oecd_member discontinued \n47 Asia \n97 Asia \n147 Asia \n217 Asia ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ctry_code</th>\n <th>iso_alpha3</th>\n <th>st3_name</th>\n <th>organisation_flag</th>\n <th>continent</th>\n <th>eu_member</th>\n <th>epo_member</th>\n <th>oecd_member</th>\n <th>discontinued</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>47</th>\n <td>CN</td>\n <td>CHN</td>\n <td>China</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>97</th>\n <td>HK</td>\n <td>HKG</td>\n <td>Hong Kong, China</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>147</th>\n <td>MO</td>\n <td>MAC</td>\n <td>Macao SAR (China)</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>217</th>\n <td>TW</td>\n <td>TWN</td>\n <td>Taiwan Province Of China</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"china_df = tls_801[tls_801.st3_name.str.lower().str.contains(\"china\")].compute()\n",
"china_df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent eu_member \n12 AT AUT Austria Europe Y \\\n19 BE BEL Belgium Europe Y \n21 BG BGR Bulgaria Europe Y \n54 CY CYP Cyprus Europe Y \n55 CZ CZE Czechia Europe Y \n57 DE DEU Germany Europe Y \n59 DK DNK Denmark Europe Y \n66 EE EST Estonia Europe Y \n72 ES ESP Spain Europe Y \n75 FI FIN Finland Europe Y \n79 FR FRA France Europe Y \n92 GR GRC Greece Europe Y \n99 HR HRV Croatia Europe Y \n101 HU HUN Hungary Europe Y \n104 IE IRL Ireland Europe Y \n111 IT ITA Italy Europe Y \n134 LT LTU Lithuania Europe Y \n135 LU LUX Luxembourg Europe Y \n136 LV LVA Latvia Europe Y \n151 MT MLT Malta Europe Y \n162 NL NLD Netherlands Europe Y \n174 PL POL Poland Europe Y \n175 PT PRT Portugal Europe Y \n180 RO ROU Romania Europe Y \n188 SE SWE Sweden Europe Y \n191 SI SVN Slovenia Europe Y \n192 SK SVK Slovakia Europe Y \n\n epo_member oecd_member discontinued \n12 Y Y \n19 Y Y \n21 Y \n54 Y \n55 Y Y \n57 Y Y \n59 Y Y \n66 Y Y \n72 Y Y \n75 Y Y \n79 Y Y \n92 Y Y \n99 Y \n101 Y Y \n104 Y Y \n111 Y Y \n134 Y Y \n135 Y Y \n136 Y Y \n151 Y \n162 Y Y \n174 Y Y \n175 Y Y \n180 Y \n188 Y Y \n191 Y Y \n192 Y Y ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ctry_code</th>\n <th>iso_alpha3</th>\n <th>st3_name</th>\n <th>organisation_flag</th>\n <th>continent</th>\n <th>eu_member</th>\n <th>epo_member</th>\n <th>oecd_member</th>\n <th>discontinued</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12</th>\n <td>AT</td>\n <td>AUT</td>\n <td>Austria</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>19</th>\n <td>BE</td>\n <td>BEL</td>\n <td>Belgium</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>21</th>\n <td>BG</td>\n <td>BGR</td>\n <td>Bulgaria</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>54</th>\n <td>CY</td>\n <td>CYP</td>\n <td>Cyprus</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>55</th>\n <td>CZ</td>\n <td>CZE</td>\n <td>Czechia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>57</th>\n <td>DE</td>\n <td>DEU</td>\n <td>Germany</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>59</th>\n <td>DK</td>\n <td>DNK</td>\n <td>Denmark</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>66</th>\n <td>EE</td>\n <td>EST</td>\n <td>Estonia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>72</th>\n <td>ES</td>\n <td>ESP</td>\n <td>Spain</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>75</th>\n <td>FI</td>\n <td>FIN</td>\n <td>Finland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>79</th>\n <td>FR</td>\n <td>FRA</td>\n <td>France</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>92</th>\n <td>GR</td>\n <td>GRC</td>\n <td>Greece</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>99</th>\n <td>HR</td>\n <td>HRV</td>\n <td>Croatia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>101</th>\n <td>HU</td>\n <td>HUN</td>\n <td>Hungary</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>104</th>\n <td>IE</td>\n <td>IRL</td>\n <td>Ireland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>111</th>\n <td>IT</td>\n <td>ITA</td>\n <td>Italy</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <
2 years ago
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eu_df = tls_801[tls_801.eu_member==\"Y\"].compute()\n",
"eu_df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"ctry_list=list(china_df[\"ctry_code\"])+list(eu_df[\"ctry_code\"])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 0 ns\n",
1 year ago
"Wall time: 0 ns\n"
2 years ago
]
}
],
"source": [
"%%time\n",
1 year ago
"tls_appln_interval = tls_201_p[((tls_201_p[\"appln_filing_year\"]>2010)&\n",
" (tls_201_p[\"appln_filing_year\"]<2024))][\"appln_id\"].unique()"
2 years ago
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"tls_206_p_subgroup = tls_206_p[tls_206_p[\"person_ctry_code\"].isin(ctry_list)][[\"person_id\",\"person_ctry_code\"]]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 59.8 s\n",
"Wall time: 33.3 s\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"appln_pers = (tls_207_p[tls_207_p['appln_id'].isin(tls_appln_interval.compute())]\n",
" ).merge(\n",
" tls_206_p_subgroup,\n",
" on=\"person_id\",how=\"inner\")[[\"appln_id\",\"person_id\",\"person_ctry_code\"]].drop_duplicates()\n",
"\n",
"appln_pers = appln_pers[appln_pers[\"person_ctry_code\"].isin(ctry_list)].drop_duplicates()\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": "Index(['appln_id', 'person_id', 'person_ctry_code'], dtype='object')"
2 years ago
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers.columns"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 22,
2 years ago
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"appln_pers.to_parquet(\"appln_pers.parquet\")"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 23,
2 years ago
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": "5221245"
2 years ago
},
1 year ago
"execution_count": 23,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers[\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 24,
2 years ago
"metadata": {},
"outputs": [],
"source": [
"id_selector = dd.read_parquet(\"appln_pers.parquet\")"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 25,
2 years ago
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " appln_id person_id person_ctry_code\n0 535761830 76992434 DE\n1 529397031 76992470 CN\n2 529397099 76992470 CN\n3 410337532 76993145 DE\n4 528363154 76993242 DE",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>person_id</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>535761830</td>\n <td>76992434</td>\n <td>DE</td>\n </tr>\n <tr>\n <th>1</th>\n <td>529397031</td>\n <td>76992470</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>2</th>\n <td>529397099</td>\n <td>76992470</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>410337532</td>\n <td>76993145</td>\n <td>DE</td>\n </tr>\n <tr>\n <th>4</th>\n <td>528363154</td>\n <td>76993242</td>\n <td>DE</td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
1 year ago
"execution_count": 25,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id_selector.head()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 26,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 11.6 s\n",
"Wall time: 11.8 s\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"eu_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(eu_df[\"ctry_code\"]))][\"appln_id\"].unique()\n",
"china_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(china_df[\"ctry_code\"]))][\"appln_id\"].unique()\n",
" \n",
"common_id = id_selector[id_selector[\"appln_id\"].isin(eu_id.compute())&\n",
" id_selector[\"appln_id\"].isin(china_id.compute())]\n",
"\n",
"common_id.to_parquet(\"common_id_CHEU.parquet\")"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 27,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 2.55 s\n",
"Wall time: 2.59 s\n"
2 years ago
]
},
{
"data": {
1 year ago
"text/plain": "64266"
2 years ago
},
1 year ago
"execution_count": 27,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"common_id[\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 28,
2 years ago
"metadata": {},
"outputs": [],
"source": [
"filtered_ids = dd.read_parquet(\"common_id_CHEU.parquet\")"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 29,
2 years ago
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " appln_id person_id person_ctry_code\n125 531979430 77025643 CN\n138 533281318 77040504 DK\n139 557203984 77040504 DK\n140 558802816 77040504 DK\n141 575538984 77040504 DK",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>person_id</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>125</th>\n <td>531979430</td>\n <td>77025643</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>138</th>\n <td>533281318</td>\n <td>77040504</td>\n <td>DK</td>\n </tr>\n <tr>\n <th>139</th>\n <td>557203984</td>\n <td>77040504</td>\n <td>DK</td>\n </tr>\n <tr>\n <th>140</th>\n <td>558802816</td>\n <td>77040504</td>\n <td>DK</td>\n </tr>\n <tr>\n <th>141</th>\n <td>575538984</td>\n <td>77040504</td>\n <td>DK</td>\n </tr>\n </tbody>\n</table>\n</div>"
2 years ago
},
1 year ago
"execution_count": 29,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_ids.head()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 30,
2 years ago
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " appln_id person_id person_ctry_code\n22116 330225325 4555802 DE\n18943 330225325 429997 CN\n21222 330322632 4853096 CN\n661 330322632 524 DE\n6945 330322632 4853097 DE\n... ... ... ...\n32498 575556091 13012618 CN\n44415 575556091 53072159 CN\n73579 575556091 85317179 CN\n17265 575556091 12469480 FR\n56058 575556091 53831662 CN\n\n[274001 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>person_id</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>22116</th>\n <td>330225325</td>\n <td>4555802</td>\n <td>DE</td>\n </tr>\n <tr>\n <th>18943</th>\n <td>330225325</td>\n <td>429997</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>21222</th>\n <td>330322632</td>\n <td>4853096</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>661</th>\n <td>330322632</td>\n <td>524</td>\n <td>DE</td>\n </tr>\n <tr>\n <th>6945</th>\n <td>330322632</td>\n <td>4853097</td>\n <td>DE</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>32498</th>\n <td>575556091</td>\n <td>13012618</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>44415</th>\n <td>575556091</td>\n <td>53072159</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>73579</th>\n <td>575556091</td>\n <td>85317179</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>17265</th>\n <td>575556091</td>\n <td>12469480</td>\n <td>FR</td>\n </tr>\n <tr>\n <th>56058</th>\n <td>575556091</td>\n <td>53831662</td>\n <td>CN</td>\n </tr>\n </tbody>\n</table>\n<p>274001 rows × 3 columns</p>\n</div>"
2 years ago
},
1 year ago
"execution_count": 30,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_ids.sort_values(by=\"appln_id\").compute()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 31,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 46.9 ms\n",
"Wall time: 68.1 ms\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"id_scope=filtered_ids[\"appln_id\"].unique().compute()\n",
"pers_id_scope=filtered_ids[\"person_id\"].unique().compute()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 32,
2 years ago
"metadata": {},
"outputs": [
{
"data": {
1 year ago
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 0 XX None D 9999-12-31 \\\n1 1 EP 103094.0 A 2000-02-15 \n2 2 EP 107845.0 A 1992-12-02 \n3 3 EP 202556.0 A 2000-07-17 \n4 4 EP 300208.0 A 2000-01-13 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 9999 None PI \\\n1 2000 00103094 PI \n2 1992 00107845 PI \n3 2000 00202556 PI \n4 2000 00300208 PI \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n0 0 ... 9999-12-31 9999 \\\n1 0 ... 2000-09-20 2000 \n2 0 ... 2000-08-02 2000 \n3 0 ... 2001-01-24 2001 \n4 0 ... 2000-07-26 2000 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n0 0 N 0 0 \\\n1 293253293 Y 8554171 1 \n2 301548848 Y 27517085 2 \n3 291964096 N 7915918 3 \n4 292901055 Y 22889365 4 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n0 1 0 0 0 \n1 6 79 1 4 \n2 8 56 2 6 \n3 4 22 2 3 \n4 6 27 1 2 \n\n[5 rows x 26 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_publn_date</th>\n <th>earliest_publn_year</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>XX</td>\n <td>None</td>\n <td>D</td>\n <td>9999-12-31</td>\n <td>9999</td>\n <td>None</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>9999-12-31</td>\n <td>9999</td>\n <td>0</td>\n <td>N</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>EP</td>\n <td>103094.0</td>\n <td>A</td>\n <td>2000-02-15</td>\n <td>2000</td>\n <td>00103094</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-09-20</td>\n <td>2000</td>\n <td>293253293</td>\n <td>Y</td>\n <td>8554171</td>\n <td>1</td>\n <td>6</td>\n <td>79</td>\n <td>1</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>EP</td>\n <td>107845.0</td>\n <td>A</td>\n <td>1992-12-02</td>\n <td>1992</td>\n <td>00107845</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-08-02</td>\n <td>2000</td>\n <td>301548848</td>\n <td>Y</td>\n <td>27517085</td>\n <td>2</td>\n <td>8</td>\n <td>56</td>\n <td>2</td>\n <td>6</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>EP</td>\n <td>202556.0</td>\n <td>A</td>\n <td>2000-07-17</td>\n <td>2000</td>\n <td>00202556</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2001-01-24</td>\n <td>2001</td>\n <td>291964096</td>\n <td>N</td>\n <td>7915918</td>\n <td>3</td>\n <td>4</td>\n <td>22</td>\n <td>2</td>\n <td>3</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>EP</td>\n <td>300208.0</td>\n <td>A</td>\n <td>2000-01-13</td>\n <td>2000</td>\n <td>00300208</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-07-26</td>\n <td>2000</td>\n <td>292901055</td>\n <td>Y</td>\n <td>22889365</td>\n <td>4</td>\n <td>6</td>\n <td>27</td>\n <td>1</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 26 columns</p>\n</div>"
2 years ago
},
1 year ago
"execution_count": 32,
2 years ago
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_201_p.head()"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 33,
2 years ago
"metadata": {},
"outputs": [],
"source": [
"outdir = \"EU_CH_scope\"\n",
"os.makedirs(outdir, exist_ok=True)"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 34,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 4min 8s\n",
"Wall time: 2min 42s\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"#Application data\n",
"tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n",
"tls_201_scope = tls_201_p[tls_201_p['appln_id'].isin(id_scope)]\n",
"tls_201_scope.compute().to_csv(f\"{outdir}/tls_201_scope.csv\", index=False)"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 35,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 1min 15s\n",
"Wall time: 23.9 s\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"#Person-appln data\n",
"tls_207_p = dd.read_parquet(\"tls_207.parquet\")\n",
"tls_207_scope = tls_207_p[((tls_207_p['person_id'].isin(pers_id_scope))&\n",
" (tls_207_p['appln_id'].isin(id_scope)))]\n",
"tls_207_scope.compute().to_csv(f\"{outdir}/tls_207_scope.csv\",index=False)"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 36,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 4min 13s\n",
"Wall time: 3min 3s\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"#Person data\n",
"tls_206_p = dd.read_parquet(\"tls_206.parquet\")\n",
"tls_206_scope = tls_206_p[tls_206_p['person_id'].isin(pers_id_scope)]\n",
"tls_206_scope.compute().to_csv(f\"{outdir}/tls_206_scope.csv\",index=False)"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 37,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 2min 8s\n",
"Wall time: 1min 29s\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"#Application title data\n",
"tls_202_p = dd.read_csv(\"table_tls202.csv\")\n",
"tls_202_scope = tls_202_p[tls_202_p['appln_id'].isin(id_scope)]\n",
"tls_202_scope.compute().to_csv(f\"{outdir}/tls_202_scope.csv\", index=False)"
]
},
{
"cell_type": "code",
1 year ago
"execution_count": 38,
2 years ago
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
1 year ago
"CPU times: total: 2min 36s\n",
"Wall time: 1min 28s\n"
2 years ago
]
}
],
"source": [
"%%time\n",
"#IPC data\n",
"tls_224_p = dd.read_csv(\"table_tls224.csv\")\n",
"tls_224_p_scope = tls_224_p[tls_224_p['appln_id'].isin(id_scope)]\n",
"tls_224_p_scope.compute().to_csv(f\"{outdir}/tls_224_scope.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}