You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_data_filter_process...

821 lines
64 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 125 ms\n",
"Wall time: 143 ms\n"
]
}
],
"source": [
"%%time\n",
"import dask"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "<dask.config.set at 0x197d05e7220>"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.set(temporary_directory=r'D:\\PATSTAT\\dask_temp')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "<dask.config.set at 0x197d0c03ac0>"
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.set({'temporary_directory': r'D:\\PATSTAT\\dask_temp'})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "{'temporary-directory': 'D:\\\\PATSTAT\\\\dask_temp',\n 'visualization': {'engine': None},\n 'tokenize': {'ensure-deterministic': False},\n 'dataframe': {'backend': 'pandas',\n 'shuffle': {'method': None, 'compression': None},\n 'parquet': {'metadata-task-size-local': 512, 'metadata-task-size-remote': 1},\n 'dtype_backend': 'pandas',\n 'convert_string': False},\n 'array': {'backend': 'numpy',\n 'rechunk': {'method': 'tasks'},\n 'svg': {'size': 120},\n 'slicing': {'split-large-chunks': None}},\n 'optimization': {'annotations': {'fuse': True},\n 'fuse': {'active': None,\n 'ave-width': 1,\n 'max-width': None,\n 'max-height': inf,\n 'max-depth-new-edges': None,\n 'subgraphs': None,\n 'rename-keys': True}}}"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.config"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import dask.dataframe as dd\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n",
"D:\\PATSTAT\n"
]
}
],
"source": [
"import os\n",
"print(os.getcwd()) # Prints the current working directory\n",
"\n",
"workdir_path=r\"D:\\PATSTAT\"\n",
"os.chdir(workdir_path)\n",
"print(os.getcwd())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 41min 20s\n",
"Wall time: 18min 32s\n"
]
}
],
"source": [
"%%time\n",
"tls_201 = dd.read_csv(\"table_tls201.csv\", low_memory=False,dtype={'appln_nr': 'object',\n",
" 'appln_nr_original': 'object'})\n",
"tls_201.to_parquet(\"tls_201.parquet\")\n",
"\n",
"tls_206 = dd.read_csv(\"table_tls206.csv\", low_memory=False)\n",
"tls_206.to_parquet(\"tls_206.parquet\")\n",
"\n",
"tls_207 = dd.read_csv(\"table_tls207.csv\", low_memory=False)\n",
"tls_207.to_parquet(\"tls_207.parquet\")\n",
"\n",
"tls_204 = dd.read_csv(\"table_tls204.csv\", low_memory=False)\n",
"tls_207.to_parquet(\"tls_204.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 0 XX None D 9999-12-31 \\\n1 1 EP 103094.0 A 2000-02-15 \n2 2 EP 107845.0 A 1992-12-02 \n3 3 EP 202556.0 A 2000-07-17 \n4 4 EP 300208.0 A 2000-01-13 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 9999 None PI \\\n1 2000 00103094 PI \n2 1992 00107845 PI \n3 2000 00202556 PI \n4 2000 00300208 PI \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n0 0 ... 9999-12-31 9999 \\\n1 0 ... 2000-09-20 2000 \n2 0 ... 2000-08-02 2000 \n3 0 ... 2001-01-24 2001 \n4 0 ... 2000-07-26 2000 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n0 0 N 0 0 \\\n1 293253293 Y 8554171 1 \n2 301548848 Y 27517085 2 \n3 291964096 N 7915918 3 \n4 292901055 Y 22889365 4 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n0 1 0 0 0 \n1 6 79 1 4 \n2 8 60 2 6 \n3 4 22 2 3 \n4 6 27 1 2 \n\n[5 rows x 26 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>appln_auth</th>\n <th>appln_nr</th>\n <th>appln_kind</th>\n <th>appln_filing_date</th>\n <th>appln_filing_year</th>\n <th>appln_nr_original</th>\n <th>ipr_type</th>\n <th>receiving_office</th>\n <th>internat_appln_id</th>\n <th>...</th>\n <th>earliest_publn_date</th>\n <th>earliest_publn_year</th>\n <th>earliest_pat_publn_id</th>\n <th>granted</th>\n <th>docdb_family_id</th>\n <th>inpadoc_family_id</th>\n <th>docdb_family_size</th>\n <th>nb_citing_docdb_fam</th>\n <th>nb_applicants</th>\n <th>nb_inventors</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>XX</td>\n <td>None</td>\n <td>D</td>\n <td>9999-12-31</td>\n <td>9999</td>\n <td>None</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>9999-12-31</td>\n <td>9999</td>\n <td>0</td>\n <td>N</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>EP</td>\n <td>103094.0</td>\n <td>A</td>\n <td>2000-02-15</td>\n <td>2000</td>\n <td>00103094</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-09-20</td>\n <td>2000</td>\n <td>293253293</td>\n <td>Y</td>\n <td>8554171</td>\n <td>1</td>\n <td>6</td>\n <td>79</td>\n <td>1</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>EP</td>\n <td>107845.0</td>\n <td>A</td>\n <td>1992-12-02</td>\n <td>1992</td>\n <td>00107845</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-08-02</td>\n <td>2000</td>\n <td>301548848</td>\n <td>Y</td>\n <td>27517085</td>\n <td>2</td>\n <td>8</td>\n <td>60</td>\n <td>2</td>\n <td>6</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>EP</td>\n <td>202556.0</td>\n <td>A</td>\n <td>2000-07-17</td>\n <td>2000</td>\n <td>00202556</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2001-01-24</td>\n <td>2001</td>\n <td>291964096</td>\n <td>N</td>\n <td>7915918</td>\n <td>3</td>\n <td>4</td>\n <td>22</td>\n <td>2</td>\n <td>3</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>EP</td>\n <td>300208.0</td>\n <td>A</td>\n <td>2000-01-13</td>\n <td>2000</td>\n <td>00300208</td>\n <td>PI</td>\n <td></td>\n <td>0</td>\n <td>...</td>\n <td>2000-07-26</td>\n <td>2000</td>\n <td>292901055</td>\n <td>Y</td>\n <td>22889365</td>\n <td>4</td>\n <td>6</td>\n <td>27</td>\n <td>1</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 26 columns</p>\n</div>"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Application data\n",
"tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n",
"tls_201_p.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n",
"# (tls_201_p[\"appln_filing_year\"]<2024)&\n",
"# (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n",
"# (tls_201_p[\"appln_filing_year\"]<2024)&\n",
"# (tls_201_p[\"granted\"]==\"N\"))][\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " person_id person_name person_ctry_code\n0 1 Nokia Corporation FI\n1 2 Lipponen, Markku FI\n2 3 Laitinen, Timo FI\n3 4 Aho, Ari FI\n4 5 Knuutila, Jarno FI",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Nokia Corporation</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Lipponen, Markku</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Laitinen, Timo</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>Aho, Ari</td>\n <td>FI</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>Knuutila, Jarno</td>\n <td>FI</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_206_p = dd.read_parquet(\"tls_206.parquet\",columns=[\"person_id\",\"person_name\",\"person_ctry_code\"])\n",
"tls_206_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " person_id appln_id\n0 1 1\n1 1 7\n2 1 46\n3 1 775\n4 1 1192",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>appln_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>7</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1</td>\n <td>46</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1</td>\n <td>775</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1</td>\n <td>1192</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_207_p = dd.read_parquet(\"tls_207.parquet\",columns=[\"person_id\",\"appln_id\"])\n",
"tls_207_p.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# tls_207_p[tls_207_p[\"appln_id\"]==1].compute()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent \n0 AD AND Andorra Europe \\\n1 AE ARE United Arab Emirates Asia \n2 AF AFG Afghanistan Asia \n3 AG ATG Antigua and Barbuda North America \n4 AI AIA Anguilla North America \n\n eu_member epo_member oecd_member discontinued \n0 \n1 \n2 \n3 \n4 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ctry_code</th>\n <th>iso_alpha3</th>\n <th>st3_name</th>\n <th>organisation_flag</th>\n <th>continent</th>\n <th>eu_member</th>\n <th>epo_member</th>\n <th>oecd_member</th>\n <th>discontinued</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>AD</td>\n <td>AND</td>\n <td>Andorra</td>\n <td></td>\n <td>Europe</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>1</th>\n <td>AE</td>\n <td>ARE</td>\n <td>United Arab Emirates</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>2</th>\n <td>AF</td>\n <td>AFG</td>\n <td>Afghanistan</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>3</th>\n <td>AG</td>\n <td>ATG</td>\n <td>Antigua and Barbuda</td>\n <td></td>\n <td>North America</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>4</th>\n <td>AI</td>\n <td>AIA</td>\n <td>Anguilla</td>\n <td></td>\n <td>North America</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_801 = dd.read_csv(\"table_tls801.csv\", low_memory=False)\n",
"tls_801.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent \n46 CN CHN China Asia \\\n96 HK HKG Hong Kong, China Asia \n146 MO MAC Macao, China Asia \n\n eu_member epo_member oecd_member discontinued \n46 \n96 \n146 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ctry_code</th>\n <th>iso_alpha3</th>\n <th>st3_name</th>\n <th>organisation_flag</th>\n <th>continent</th>\n <th>eu_member</th>\n <th>epo_member</th>\n <th>oecd_member</th>\n <th>discontinued</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>46</th>\n <td>CN</td>\n <td>CHN</td>\n <td>China</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>96</th>\n <td>HK</td>\n <td>HKG</td>\n <td>Hong Kong, China</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>146</th>\n <td>MO</td>\n <td>MAC</td>\n <td>Macao, China</td>\n <td></td>\n <td>Asia</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"china_df = tls_801[((tls_801.st3_name.str.lower().str.contains(\"china\"))&\n",
" ~(tls_801.st3_name.str.lower().str.contains(\"taiwan\")))].compute()\n",
"china_df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent eu_member \n11 AT AUT Austria Europe Y \\\n18 BE BEL Belgium Europe Y \n20 BG BGR Bulgaria Europe Y \n53 CY CYP Cyprus Europe Y \n54 CZ CZE Czechia Europe Y \n56 DE DEU Germany Europe Y \n58 DK DNK Denmark Europe Y \n65 EE EST Estonia Europe Y \n71 ES ESP Spain Europe Y \n74 FI FIN Finland Europe Y \n78 FR FRA France Europe Y \n91 GR GRC Greece Europe Y \n98 HR HRV Croatia Europe Y \n100 HU HUN Hungary Europe Y \n103 IE IRL Ireland Europe Y \n110 IT ITA Italy Europe Y \n133 LT LTU Lithuania Europe Y \n134 LU LUX Luxembourg Europe Y \n135 LV LVA Latvia Europe Y \n150 MT MLT Malta Europe Y \n161 NL NLD Netherlands Europe Y \n173 PL POL Poland Europe Y \n174 PT PRT Portugal Europe Y \n179 RO ROU Romania Europe Y \n187 SE SWE Sweden Europe Y \n190 SI SVN Slovenia Europe Y \n191 SK SVK Slovakia Europe Y \n\n epo_member oecd_member discontinued \n11 Y Y \n18 Y Y \n20 Y \n53 Y \n54 Y Y \n56 Y Y \n58 Y Y \n65 Y Y \n71 Y Y \n74 Y Y \n78 Y Y \n91 Y Y \n98 Y \n100 Y Y \n103 Y Y \n110 Y Y \n133 Y Y \n134 Y Y \n135 Y Y \n150 Y \n161 Y Y \n173 Y Y \n174 Y Y \n179 Y \n187 Y Y \n190 Y Y \n191 Y Y ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ctry_code</th>\n <th>iso_alpha3</th>\n <th>st3_name</th>\n <th>organisation_flag</th>\n <th>continent</th>\n <th>eu_member</th>\n <th>epo_member</th>\n <th>oecd_member</th>\n <th>discontinued</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>11</th>\n <td>AT</td>\n <td>AUT</td>\n <td>Austria</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>18</th>\n <td>BE</td>\n <td>BEL</td>\n <td>Belgium</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>20</th>\n <td>BG</td>\n <td>BGR</td>\n <td>Bulgaria</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>53</th>\n <td>CY</td>\n <td>CYP</td>\n <td>Cyprus</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>54</th>\n <td>CZ</td>\n <td>CZE</td>\n <td>Czechia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>56</th>\n <td>DE</td>\n <td>DEU</td>\n <td>Germany</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>58</th>\n <td>DK</td>\n <td>DNK</td>\n <td>Denmark</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>65</th>\n <td>EE</td>\n <td>EST</td>\n <td>Estonia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>71</th>\n <td>ES</td>\n <td>ESP</td>\n <td>Spain</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>74</th>\n <td>FI</td>\n <td>FIN</td>\n <td>Finland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>78</th>\n <td>FR</td>\n <td>FRA</td>\n <td>France</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>91</th>\n <td>GR</td>\n <td>GRC</td>\n <td>Greece</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>98</th>\n <td>HR</td>\n <td>HRV</td>\n <td>Croatia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>100</th>\n <td>HU</td>\n <td>HUN</td>\n <td>Hungary</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>103</th>\n <td>IE</td>\n <td>IRL</td>\n <td>Ireland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>110</th>\n <td>IT</td>\n <td>ITA</td>\n <td>Italy</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>133</th>\n <td>LT</td>\n <td>LTU</td>\n <td>Lithuania</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>134</th>\n <td>LU</td>\n <td>LUX</td>\n <td>Luxembourg</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>135</th>\n <td>LV</td>\n <td>LVA</td>\n <td>Latvia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>150</th>\n <td>MT</td>\n <td>MLT</td>\n <td>Malta</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>161</th>\n <td>NL</td>\n <td>NLD</td>\n <td>Netherlands</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>173</th>\n <td>PL</td>\n <td>POL</td>\n <td>Poland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>174</th>\n <td>PT</td>\n <td>PRT</td>\n <td>Portugal</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>179</th>\n <td>RO</td>\n <td>ROU</td>\n <td>Romania</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>187</th>\n <td>SE</td>\n <td>SWE</td>\n <td>Sweden</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>190</th>\n <td>SI</td>\n <td>SVN</td>\n <td>Slovenia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>191</th>\n <td>SK</td>\n <td>SVK</td>\n <td>Slovakia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eu_df = tls_801[tls_801.eu_member==\"Y\"].compute()\n",
"eu_df"
]
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": " ctry_code iso_alpha3 st3_name organisation_flag \n11 AT AUT Austria \\\n18 BE BEL Belgium \n20 BG BGR Bulgaria \n41 CH CHE Switzerland \n53 CY CYP Cyprus \n54 CZ CZE Czechia \n56 DE DEU Germany \n58 DK DNK Denmark \n65 EE EST Estonia \n71 ES ESP Spain \n74 FI FIN Finland \n78 FR FRA France \n91 GR GRC Greece \n98 HR HRV Croatia \n100 HU HUN Hungary \n103 IE IRL Ireland \n110 IT ITA Italy \n133 LT LTU Lithuania \n134 LU LUX Luxembourg \n135 LV LVA Latvia \n150 MT MLT Malta \n161 NL NLD Netherlands \n162 NO NOR Norway \n173 PL POL Poland \n174 PT PRT Portugal \n179 RO ROU Romania \n187 SE SWE Sweden \n190 SI SVN Slovenia \n191 SK SVK Slovakia \n220 US USA United States of America \n\n continent eu_member epo_member oecd_member discontinued \n11 Europe Y Y Y \n18 Europe Y Y Y \n20 Europe Y Y \n41 Europe Y Y \n53 Europe Y Y \n54 Europe Y Y Y \n56 Europe Y Y Y \n58 Europe Y Y Y \n65 Europe Y Y Y \n71 Europe Y Y Y \n74 Europe Y Y Y \n78 Europe Y Y Y \n91 Europe Y Y Y \n98 Europe Y Y \n100 Europe Y Y Y \n103 Europe Y Y Y \n110 Europe Y Y Y \n133 Europe Y Y Y \n134 Europe Y Y Y \n135 Europe Y Y Y \n150 Europe Y Y \n161 Europe Y Y Y \n162 Europe Y Y \n173 Europe Y Y Y \n174 Europe Y Y Y \n179 Europe Y Y \n187 Europe Y Y Y \n190 Europe Y Y Y \n191 Europe Y Y Y \n220 North America Y ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ctry_code</th>\n <th>iso_alpha3</th>\n <th>st3_name</th>\n <th>organisation_flag</th>\n <th>continent</th>\n <th>eu_member</th>\n <th>epo_member</th>\n <th>oecd_member</th>\n <th>discontinued</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>11</th>\n <td>AT</td>\n <td>AUT</td>\n <td>Austria</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>18</th>\n <td>BE</td>\n <td>BEL</td>\n <td>Belgium</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>20</th>\n <td>BG</td>\n <td>BGR</td>\n <td>Bulgaria</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>41</th>\n <td>CH</td>\n <td>CHE</td>\n <td>Switzerland</td>\n <td></td>\n <td>Europe</td>\n <td></td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>53</th>\n <td>CY</td>\n <td>CYP</td>\n <td>Cyprus</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>54</th>\n <td>CZ</td>\n <td>CZE</td>\n <td>Czechia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>56</th>\n <td>DE</td>\n <td>DEU</td>\n <td>Germany</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>58</th>\n <td>DK</td>\n <td>DNK</td>\n <td>Denmark</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>65</th>\n <td>EE</td>\n <td>EST</td>\n <td>Estonia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>71</th>\n <td>ES</td>\n <td>ESP</td>\n <td>Spain</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>74</th>\n <td>FI</td>\n <td>FIN</td>\n <td>Finland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>78</th>\n <td>FR</td>\n <td>FRA</td>\n <td>France</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>91</th>\n <td>GR</td>\n <td>GRC</td>\n <td>Greece</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>98</th>\n <td>HR</td>\n <td>HRV</td>\n <td>Croatia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>100</th>\n <td>HU</td>\n <td>HUN</td>\n <td>Hungary</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>103</th>\n <td>IE</td>\n <td>IRL</td>\n <td>Ireland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>110</th>\n <td>IT</td>\n <td>ITA</td>\n <td>Italy</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>133</th>\n <td>LT</td>\n <td>LTU</td>\n <td>Lithuania</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>134</th>\n <td>LU</td>\n <td>LUX</td>\n <td>Luxembourg</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>135</th>\n <td>LV</td>\n <td>LVA</td>\n <td>Latvia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>150</th>\n <td>MT</td>\n <td>MLT</td>\n <td>Malta</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>161</th>\n <td>NL</td>\n <td>NLD</td>\n <td>Netherlands</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>162</th>\n <td>NO</td>\n <td>NOR</td>\n <td>Norway</td>\n <td></td>\n <td>Europe</td>\n <td></td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>173</th>\n <td>PL</td>\n <td>POL</td>\n <td>Poland</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>174</th>\n <td>PT</td>\n <td>PRT</td>\n <td>Portugal</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>179</th>\n <td>RO</td>\n <td>ROU</td>\n <td>Romania</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <th>187</th>\n <td>SE</td>\n <td>SWE</td>\n <td>Sweden</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>190</th>\n <td>SI</td>\n <td>SVN</td>\n <td>Slovenia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>191</th>\n <td>SK</td>\n <td>SVK</td>\n <td>Slovakia</td>\n <td></td>\n <td>Europe</td>\n <td>Y</td>\n <td>Y</td>\n <td>Y</td>\n <td></td>\n </tr>\n <tr>\n <th>220</th>\n <td>US</td>\n <td>USA</td>\n <td>United States of America</td>\n <td></td>\n <td>North America</td>\n <td></td>\n <td></td>\n <td>Y</td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"western_df = tls_801[((tls_801.eu_member==\"Y\")|\n",
" (tls_801.ctry_code == 'US')|\n",
" (tls_801.ctry_code == 'NO')|\n",
" (tls_801.ctry_code == 'CH')|\n",
" (tls_801.ctry_code == 'UK'))].compute()\n",
"western_df"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "['CN',\n 'HK',\n 'MO',\n 'AT',\n 'BE',\n 'BG',\n 'CH',\n 'CY',\n 'CZ',\n 'DE',\n 'DK',\n 'EE',\n 'ES',\n 'FI',\n 'FR',\n 'GR',\n 'HR',\n 'HU',\n 'IE',\n 'IT',\n 'LT',\n 'LU',\n 'LV',\n 'MT',\n 'NL',\n 'NO',\n 'PL',\n 'PT',\n 'RO',\n 'SE',\n 'SI',\n 'SK',\n 'US']"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ctry_list=list(china_df[\"ctry_code\"])+list(western_df[\"ctry_code\"])\n",
"ctry_list"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 15.6 ms\n",
"Wall time: 6 ms\n"
]
}
],
"source": [
"%%time\n",
"tls_appln_interval = tls_201_p[((tls_201_p[\"appln_filing_year\"]>2010)&\n",
" (tls_201_p[\"appln_filing_year\"]<2025))][\"appln_id\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"tls_206_p_subgroup = tls_206_p[tls_206_p[\"person_ctry_code\"].isin(ctry_list)][[\"person_id\",\"person_ctry_code\"]]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 45min 6s\n",
"Wall time: 11min 58s\n"
]
}
],
"source": [
"%%time\n",
"appln_pers = (tls_207_p[tls_207_p['appln_id'].isin(tls_appln_interval.compute())]\n",
" ).merge(\n",
" tls_206_p_subgroup,\n",
" on=\"person_id\",how=\"inner\")[[\"appln_id\",\"person_id\",\"person_ctry_code\"]].drop_duplicates()\n",
"\n",
"appln_pers = appln_pers[appln_pers[\"person_ctry_code\"].isin(ctry_list)].drop_duplicates()\n",
"appln_pers.to_parquet(\"appln_pers.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Index(['appln_id', 'person_id', 'person_ctry_code'], dtype='object')"
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers.columns"
]
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"data": {
"text/plain": "(Delayed('int-8f92fe68-e5a1-4f9e-a30c-77833e0fff26'), 3)"
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers.shape"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[34], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mappln_pers\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mappln_id\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mnunique\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcompute\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\base.py:314\u001B[0m, in \u001B[0;36mDaskMethodsMixin.compute\u001B[1;34m(self, **kwargs)\u001B[0m\n\u001B[0;32m 290\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mcompute\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[0;32m 291\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"Compute this dask collection\u001B[39;00m\n\u001B[0;32m 292\u001B[0m \n\u001B[0;32m 293\u001B[0m \u001B[38;5;124;03m This turns a lazy Dask collection into its in-memory equivalent.\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 312\u001B[0m \u001B[38;5;124;03m dask.base.compute\u001B[39;00m\n\u001B[0;32m 313\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m--> 314\u001B[0m (result,) \u001B[38;5;241m=\u001B[39m compute(\u001B[38;5;28mself\u001B[39m, traverse\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m 315\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m result\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\base.py:599\u001B[0m, in \u001B[0;36mcompute\u001B[1;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001B[0m\n\u001B[0;32m 596\u001B[0m keys\u001B[38;5;241m.\u001B[39mappend(x\u001B[38;5;241m.\u001B[39m__dask_keys__())\n\u001B[0;32m 597\u001B[0m postcomputes\u001B[38;5;241m.\u001B[39mappend(x\u001B[38;5;241m.\u001B[39m__dask_postcompute__())\n\u001B[1;32m--> 599\u001B[0m results \u001B[38;5;241m=\u001B[39m schedule(dsk, keys, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m 600\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m repack([f(r, \u001B[38;5;241m*\u001B[39ma) \u001B[38;5;28;01mfor\u001B[39;00m r, (f, a) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mzip\u001B[39m(results, postcomputes)])\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\threaded.py:89\u001B[0m, in \u001B[0;36mget\u001B[1;34m(dsk, keys, cache, num_workers, pool, **kwargs)\u001B[0m\n\u001B[0;32m 86\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(pool, multiprocessing\u001B[38;5;241m.\u001B[39mpool\u001B[38;5;241m.\u001B[39mPool):\n\u001B[0;32m 87\u001B[0m pool \u001B[38;5;241m=\u001B[39m MultiprocessingPoolExecutor(pool)\n\u001B[1;32m---> 89\u001B[0m results \u001B[38;5;241m=\u001B[39m get_async(\n\u001B[0;32m 90\u001B[0m pool\u001B[38;5;241m.\u001B[39msubmit,\n\u001B[0;32m 91\u001B[0m pool\u001B[38;5;241m.\u001B[39m_max_workers,\n\u001B[0;32m 92\u001B[0m dsk,\n\u001B[0;32m 93\u001B[0m keys,\n\u001B[0;32m 94\u001B[0m cache\u001B[38;5;241m=\u001B[39mcache,\n\u001B[0;32m 95\u001B[0m get_id\u001B[38;5;241m=\u001B[39m_thread_get_id,\n\u001B[0;32m 96\u001B[0m pack_exception\u001B[38;5;241m=\u001B[39mpack_exception,\n\u001B[0;32m 97\u001B[0m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs,\n\u001B[0;32m 98\u001B[0m )\n\u001B[0;32m 100\u001B[0m \u001B[38;5;66;03m# Cleanup pools associated to dead threads\u001B[39;00m\n\u001B[0;32m 101\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m pools_lock:\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\local.py:500\u001B[0m, in \u001B[0;36mget_async\u001B[1;34m(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)\u001B[0m\n\u001B[0;32m 498\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m state[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mwaiting\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;129;01mor\u001B[39;00m state[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mready\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;129;01mor\u001B[39;00m state[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mrunning\u001B[39m\u001B[38;5;124m\"\u001B[39m]:\n\u001B[0;32m 499\u001B[0m fire_tasks(chunksize)\n\u001B[1;32m--> 500\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m key, res_info, failed \u001B[38;5;129;01min\u001B[39;00m \u001B[43mqueue_get\u001B[49m\u001B[43m(\u001B[49m\u001B[43mqueue\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241m.\u001B[39mresult():\n\u001B[0;32m 501\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m failed:\n\u001B[0;32m 502\u001B[0m exc, tb \u001B[38;5;241m=\u001B[39m loads(res_info)\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\local.py:130\u001B[0m, in \u001B[0;36mqueue_get\u001B[1;34m(q)\u001B[0m\n\u001B[0;32m 128\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m \u001B[38;5;28;01mTrue\u001B[39;00m:\n\u001B[0;32m 129\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 130\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mq\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[43mblock\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtimeout\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m0.1\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[0;32m 131\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m Empty:\n\u001B[0;32m 132\u001B[0m \u001B[38;5;28;01mpass\u001B[39;00m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\queue.py:180\u001B[0m, in \u001B[0;36mQueue.get\u001B[1;34m(self, block, timeout)\u001B[0m\n\u001B[0;32m 178\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m remaining \u001B[38;5;241m<\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m0.0\u001B[39m:\n\u001B[0;32m 179\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m Empty\n\u001B[1;32m--> 180\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mnot_empty\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mwait\u001B[49m\u001B[43m(\u001B[49m\u001B[43mremaining\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 181\u001B[0m item \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_get()\n\u001B[0;32m 182\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnot_full\u001B[38;5;241m.\u001B[39mnotify()\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\threading.py:316\u001B[0m, in \u001B[0;36mCondition.wait\u001B[1;34m(self, timeout)\u001B[0m\n\u001B[0;32m 314\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 315\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m timeout \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[1;32m--> 316\u001B[0m gotit \u001B[38;5;241m=\u001B[39m \u001B[43mwaiter\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43macquire\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtimeout\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 317\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 318\u001B[0m gotit \u001B[38;5;241m=\u001B[39m waiter\u001B[38;5;241m.\u001B[39macquire(\u001B[38;5;28;01mFalse\u001B[39;00m)\n",
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
]
}
],
"source": [
"appln_pers[\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"id_selector = dd.read_parquet(\"appln_pers.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " appln_id person_id person_ctry_code\n0 515465980 76083836 CH\n1 530145641 76089672 AT\n2 532157050 76089962 CN\n3 532162718 76089962 CN\n4 532164120 76089962 CN",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>person_id</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>515465980</td>\n <td>76083836</td>\n <td>CH</td>\n </tr>\n <tr>\n <th>1</th>\n <td>530145641</td>\n <td>76089672</td>\n <td>AT</td>\n </tr>\n <tr>\n <th>2</th>\n <td>532157050</td>\n <td>76089962</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>532162718</td>\n <td>76089962</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>4</th>\n <td>532164120</td>\n <td>76089962</td>\n <td>CN</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id_selector.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [
{
"data": {
"text/plain": "33"
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id_selector.person_ctry_code.nunique().compute()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [
{
"data": {
"text/plain": "person_ctry_code\nUS 16249179\nDE 4980374\nCN 4137237\nFR 1953182\nCH 819720\nNL 712685\nIT 659643\nSE 575791\nES 404734\nAT 341489\nBE 324202\nFI 308472\nDK 267490\nPL 256066\nNO 134220\nIE 123974\nCZ 120706\nHU 55884\nHK 51785\nRO 49656\nPT 44892\nLU 39736\nGR 29126\nSK 28316\nSI 27382\nBG 13125\nLV 10131\nEE 9364\nLT 8910\nHR 6540\nCY 6232\nMT 6021\nMO 1703\nName: count, dtype: int64"
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id_selector.person_ctry_code.value_counts().compute()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 29.2 s\n",
"Wall time: 29.6 s\n"
]
}
],
"source": [
"%%time\n",
"eu_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(western_df[\"ctry_code\"]))][\"appln_id\"].unique()\n",
"china_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(china_df[\"ctry_code\"]))][\"appln_id\"].unique()\n",
" \n",
"common_id = id_selector[id_selector[\"appln_id\"].isin(eu_id.compute())&\n",
" id_selector[\"appln_id\"].isin(china_id.compute())]\n",
"\n",
"common_id.to_parquet(\"common_id_CHEU.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 5.62 s\n",
"Wall time: 5.61 s\n"
]
},
{
"data": {
"text/plain": "203873"
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"common_id[\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"filtered_ids = dd.read_parquet(\"common_id_CHEU.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " appln_id person_id person_ctry_code\n102 531966031 76189975 CN\n104 533342168 76190204 CN\n117 504910639 76196951 DE\n144 537370384 76217739 AT\n160 527648360 76227149 ES",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>person_id</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>102</th>\n <td>531966031</td>\n <td>76189975</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>104</th>\n <td>533342168</td>\n <td>76190204</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>117</th>\n <td>504910639</td>\n <td>76196951</td>\n <td>DE</td>\n </tr>\n <tr>\n <th>144</th>\n <td>537370384</td>\n <td>76217739</td>\n <td>AT</td>\n </tr>\n <tr>\n <th>160</th>\n <td>527648360</td>\n <td>76227149</td>\n <td>ES</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_ids.head()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " appln_id person_id person_ctry_code\n9469 330225325 429997 CN\n16032 330225325 4555802 DE\n4448 330225397 4849499 US\n19897 330225397 60435 US\n31709 330225397 4284 US\n... ... ... ...\n69073 587679340 18114863 US\n74083 587679340 19330879 CN\n102624 587679340 53002958 US\n73989 587679340 41556280 US\n101114 587679340 48821759 US\n\n[1025221 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>appln_id</th>\n <th>person_id</th>\n <th>person_ctry_code</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>9469</th>\n <td>330225325</td>\n <td>429997</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>16032</th>\n <td>330225325</td>\n <td>4555802</td>\n <td>DE</td>\n </tr>\n <tr>\n <th>4448</th>\n <td>330225397</td>\n <td>4849499</td>\n <td>US</td>\n </tr>\n <tr>\n <th>19897</th>\n <td>330225397</td>\n <td>60435</td>\n <td>US</td>\n </tr>\n <tr>\n <th>31709</th>\n <td>330225397</td>\n <td>4284</td>\n <td>US</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>69073</th>\n <td>587679340</td>\n <td>18114863</td>\n <td>US</td>\n </tr>\n <tr>\n <th>74083</th>\n <td>587679340</td>\n <td>19330879</td>\n <td>CN</td>\n </tr>\n <tr>\n <th>102624</th>\n <td>587679340</td>\n <td>53002958</td>\n <td>US</td>\n </tr>\n <tr>\n <th>73989</th>\n <td>587679340</td>\n <td>41556280</td>\n <td>US</td>\n </tr>\n <tr>\n <th>101114</th>\n <td>587679340</td>\n <td>48821759</td>\n <td>US</td>\n </tr>\n </tbody>\n</table>\n<p>1025221 rows × 3 columns</p>\n</div>"
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_ids.sort_values(by=\"appln_id\").compute()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 234 ms\n",
"Wall time: 216 ms\n"
]
}
],
"source": [
"%%time\n",
"id_scope=filtered_ids[\"appln_id\"].unique().compute()\n",
"pers_id_scope=filtered_ids[\"person_id\"].unique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# tls_201_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"outdir = \"WESTERN_CH_scope\"\n",
"os.makedirs(outdir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 4min 42s\n",
"Wall time: 2min 58s\n"
]
}
],
"source": [
"%%time\n",
"#Application data\n",
"tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n",
"tls_201_scope = tls_201_p[tls_201_p['appln_id'].isin(id_scope)]\n",
"tls_201_scope.compute().to_csv(f\"{outdir}/tls_201_scope.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 1min 46s\n",
"Wall time: 31.4 s\n"
]
}
],
"source": [
"%%time\n",
"#Person-appln data\n",
"tls_207_p = dd.read_parquet(\"tls_207.parquet\")\n",
"tls_207_scope = tls_207_p[((tls_207_p['person_id'].isin(pers_id_scope))&\n",
" (tls_207_p['appln_id'].isin(id_scope)))]\n",
"tls_207_scope.compute().to_csv(f\"{outdir}/tls_207_scope.csv\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 4min 59s\n",
"Wall time: 3min 38s\n"
]
}
],
"source": [
"%%time\n",
"#Person data\n",
"tls_206_p = dd.read_parquet(\"tls_206.parquet\")\n",
"tls_206_scope = tls_206_p[tls_206_p['person_id'].isin(pers_id_scope)]\n",
"tls_206_scope.compute().to_csv(f\"{outdir}/tls_206_scope.csv\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 2min 49s\n",
"Wall time: 2min 6s\n"
]
}
],
"source": [
"%%time\n",
"#Application title data\n",
"tls_202_p = dd.read_csv(\"table_tls202.csv\")\n",
"tls_202_scope = tls_202_p[tls_202_p['appln_id'].isin(id_scope)]\n",
"tls_202_scope.compute().to_csv(f\"{outdir}/tls_202_scope.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 3min 17s\n",
"Wall time: 2min 1s\n"
]
}
],
"source": [
"%%time\n",
"#IPC data\n",
"tls_224_p = dd.read_csv(\"table_tls224.csv\")\n",
"tls_224_p_scope = tls_224_p[tls_224_p['appln_id'].isin(id_scope)]\n",
"tls_224_p_scope.compute().to_csv(f\"{outdir}/tls_224_scope.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}