{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 125 ms\n", "Wall time: 143 ms\n" ] } ], "source": [ "%%time\n", "import dask" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": "" }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.set(temporary_directory=r'D:\\PATSTAT\\dask_temp')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": "" }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.set({'temporary_directory': r'D:\\PATSTAT\\dask_temp'})" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": "{'temporary-directory': 'D:\\\\PATSTAT\\\\dask_temp',\n 'visualization': {'engine': None},\n 'tokenize': {'ensure-deterministic': False},\n 'dataframe': {'backend': 'pandas',\n 'shuffle': {'method': None, 'compression': None},\n 'parquet': {'metadata-task-size-local': 512, 'metadata-task-size-remote': 1},\n 'dtype_backend': 'pandas',\n 'convert_string': False},\n 'array': {'backend': 'numpy',\n 'rechunk': {'method': 'tasks'},\n 'svg': {'size': 120},\n 'slicing': {'split-large-chunks': None}},\n 'optimization': {'annotations': {'fuse': True},\n 'fuse': {'active': None,\n 'ave-width': 1,\n 'max-width': None,\n 'max-height': inf,\n 'max-depth-new-edges': None,\n 'subgraphs': None,\n 'rename-keys': True}}}" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.config" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import dask.dataframe as dd\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n", "D:\\PATSTAT\n" ] } ], "source": [ "import os\n", "print(os.getcwd()) # Prints the current working directory\n", "\n", "workdir_path=r\"D:\\PATSTAT\"\n", "os.chdir(workdir_path)\n", "print(os.getcwd())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 41min 20s\n", "Wall time: 18min 32s\n" ] } ], "source": [ "%%time\n", "tls_201 = dd.read_csv(\"table_tls201.csv\", low_memory=False,dtype={'appln_nr': 'object',\n", " 'appln_nr_original': 'object'})\n", "tls_201.to_parquet(\"tls_201.parquet\")\n", "\n", "tls_206 = dd.read_csv(\"table_tls206.csv\", low_memory=False)\n", "tls_206.to_parquet(\"tls_206.parquet\")\n", "\n", "tls_207 = dd.read_csv(\"table_tls207.csv\", low_memory=False)\n", "tls_207.to_parquet(\"tls_207.parquet\")\n", "\n", "tls_204 = dd.read_csv(\"table_tls204.csv\", low_memory=False)\n", "tls_207.to_parquet(\"tls_204.parquet\")" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 0 XX None D 9999-12-31 \\\n1 1 EP 103094.0 A 2000-02-15 \n2 2 EP 107845.0 A 1992-12-02 \n3 3 EP 202556.0 A 2000-07-17 \n4 4 EP 300208.0 A 2000-01-13 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 9999 None PI \\\n1 2000 00103094 PI \n2 1992 00107845 PI \n3 2000 00202556 PI \n4 2000 00300208 PI \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n0 0 ... 9999-12-31 9999 \\\n1 0 ... 2000-09-20 2000 \n2 0 ... 2000-08-02 2000 \n3 0 ... 2001-01-24 2001 \n4 0 ... 2000-07-26 2000 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n0 0 N 0 0 \\\n1 293253293 Y 8554171 1 \n2 301548848 Y 27517085 2 \n3 291964096 N 7915918 3 \n4 292901055 Y 22889365 4 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n0 1 0 0 0 \n1 6 79 1 4 \n2 8 60 2 6 \n3 4 22 2 3 \n4 6 27 1 2 \n\n[5 rows x 26 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idappln_authappln_nrappln_kindappln_filing_dateappln_filing_yearappln_nr_originalipr_typereceiving_officeinternat_appln_id...earliest_publn_dateearliest_publn_yearearliest_pat_publn_idgranteddocdb_family_idinpadoc_family_iddocdb_family_sizenb_citing_docdb_famnb_applicantsnb_inventors
00XXNoneD9999-12-319999NonePI0...9999-12-3199990N001000
11EP103094.0A2000-02-15200000103094PI0...2000-09-202000293253293Y8554171167914
22EP107845.0A1992-12-02199200107845PI0...2000-08-022000301548848Y27517085286026
33EP202556.0A2000-07-17200000202556PI0...2001-01-242001291964096N7915918342223
44EP300208.0A2000-01-13200000300208PI0...2000-07-262000292901055Y22889365462712
\n

5 rows × 26 columns

\n
" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Application data\n", "tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n", "tls_201_p.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n", "# (tls_201_p[\"appln_filing_year\"]<2024)&\n", "# (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n", "# (tls_201_p[\"appln_filing_year\"]<2024)&\n", "# (tls_201_p[\"granted\"]==\"N\"))][\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": " person_id person_name person_ctry_code\n0 1 Nokia Corporation FI\n1 2 Lipponen, Markku FI\n2 3 Laitinen, Timo FI\n3 4 Aho, Ari FI\n4 5 Knuutila, Jarno FI", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
person_idperson_nameperson_ctry_code
01Nokia CorporationFI
12Lipponen, MarkkuFI
23Laitinen, TimoFI
34Aho, AriFI
45Knuutila, JarnoFI
\n
" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_206_p = dd.read_parquet(\"tls_206.parquet\",columns=[\"person_id\",\"person_name\",\"person_ctry_code\"])\n", "tls_206_p.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": " person_id appln_id\n0 1 1\n1 1 7\n2 1 46\n3 1 775\n4 1 1192", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
person_idappln_id
011
117
2146
31775
411192
\n
" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_207_p = dd.read_parquet(\"tls_207.parquet\",columns=[\"person_id\",\"appln_id\"])\n", "tls_207_p.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# tls_207_p[tls_207_p[\"appln_id\"]==1].compute()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent \n0 AD AND Andorra Europe \\\n1 AE ARE United Arab Emirates Asia \n2 AF AFG Afghanistan Asia \n3 AG ATG Antigua and Barbuda North America \n4 AI AIA Anguilla North America \n\n eu_member epo_member oecd_member discontinued \n0 \n1 \n2 \n3 \n4 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
0ADANDAndorraEurope
1AEAREUnited Arab EmiratesAsia
2AFAFGAfghanistanAsia
3AGATGAntigua and BarbudaNorth America
4AIAIAAnguillaNorth America
\n
" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_801 = dd.read_csv(\"table_tls801.csv\", low_memory=False)\n", "tls_801.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent \n46 CN CHN China Asia \\\n96 HK HKG Hong Kong, China Asia \n146 MO MAC Macao, China Asia \n\n eu_member epo_member oecd_member discontinued \n46 \n96 \n146 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
46CNCHNChinaAsia
96HKHKGHong Kong, ChinaAsia
146MOMACMacao, ChinaAsia
\n
" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "china_df = tls_801[((tls_801.st3_name.str.lower().str.contains(\"china\"))&\n", " ~(tls_801.st3_name.str.lower().str.contains(\"taiwan\")))].compute()\n", "china_df" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent eu_member \n11 AT AUT Austria Europe Y \\\n18 BE BEL Belgium Europe Y \n20 BG BGR Bulgaria Europe Y \n53 CY CYP Cyprus Europe Y \n54 CZ CZE Czechia Europe Y \n56 DE DEU Germany Europe Y \n58 DK DNK Denmark Europe Y \n65 EE EST Estonia Europe Y \n71 ES ESP Spain Europe Y \n74 FI FIN Finland Europe Y \n78 FR FRA France Europe Y \n91 GR GRC Greece Europe Y \n98 HR HRV Croatia Europe Y \n100 HU HUN Hungary Europe Y \n103 IE IRL Ireland Europe Y \n110 IT ITA Italy Europe Y \n133 LT LTU Lithuania Europe Y \n134 LU LUX Luxembourg Europe Y \n135 LV LVA Latvia Europe Y \n150 MT MLT Malta Europe Y \n161 NL NLD Netherlands Europe Y \n173 PL POL Poland Europe Y \n174 PT PRT Portugal Europe Y \n179 RO ROU Romania Europe Y \n187 SE SWE Sweden Europe Y \n190 SI SVN Slovenia Europe Y \n191 SK SVK Slovakia Europe Y \n\n epo_member oecd_member discontinued \n11 Y Y \n18 Y Y \n20 Y \n53 Y \n54 Y Y \n56 Y Y \n58 Y Y \n65 Y Y \n71 Y Y \n74 Y Y \n78 Y Y \n91 Y Y \n98 Y \n100 Y Y \n103 Y Y \n110 Y Y \n133 Y Y \n134 Y Y \n135 Y Y \n150 Y \n161 Y Y \n173 Y Y \n174 Y Y \n179 Y \n187 Y Y \n190 Y Y \n191 Y Y ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
11ATAUTAustriaEuropeYYY
18BEBELBelgiumEuropeYYY
20BGBGRBulgariaEuropeYY
53CYCYPCyprusEuropeYY
54CZCZECzechiaEuropeYYY
56DEDEUGermanyEuropeYYY
58DKDNKDenmarkEuropeYYY
65EEESTEstoniaEuropeYYY
71ESESPSpainEuropeYYY
74FIFINFinlandEuropeYYY
78FRFRAFranceEuropeYYY
91GRGRCGreeceEuropeYYY
98HRHRVCroatiaEuropeYY
100HUHUNHungaryEuropeYYY
103IEIRLIrelandEuropeYYY
110ITITAItalyEuropeYYY
133LTLTULithuaniaEuropeYYY
134LULUXLuxembourgEuropeYYY
135LVLVALatviaEuropeYYY
150MTMLTMaltaEuropeYY
161NLNLDNetherlandsEuropeYYY
173PLPOLPolandEuropeYYY
174PTPRTPortugalEuropeYYY
179ROROURomaniaEuropeYY
187SESWESwedenEuropeYYY
190SISVNSloveniaEuropeYYY
191SKSVKSlovakiaEuropeYYY
\n
" }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eu_df = tls_801[tls_801.eu_member==\"Y\"].compute()\n", "eu_df" ] }, { "cell_type": "code", "execution_count": 19, "outputs": [ { "data": { "text/plain": " ctry_code iso_alpha3 st3_name organisation_flag \n11 AT AUT Austria \\\n18 BE BEL Belgium \n20 BG BGR Bulgaria \n41 CH CHE Switzerland \n53 CY CYP Cyprus \n54 CZ CZE Czechia \n56 DE DEU Germany \n58 DK DNK Denmark \n65 EE EST Estonia \n71 ES ESP Spain \n74 FI FIN Finland \n78 FR FRA France \n91 GR GRC Greece \n98 HR HRV Croatia \n100 HU HUN Hungary \n103 IE IRL Ireland \n110 IT ITA Italy \n133 LT LTU Lithuania \n134 LU LUX Luxembourg \n135 LV LVA Latvia \n150 MT MLT Malta \n161 NL NLD Netherlands \n162 NO NOR Norway \n173 PL POL Poland \n174 PT PRT Portugal \n179 RO ROU Romania \n187 SE SWE Sweden \n190 SI SVN Slovenia \n191 SK SVK Slovakia \n220 US USA United States of America \n\n continent eu_member epo_member oecd_member discontinued \n11 Europe Y Y Y \n18 Europe Y Y Y \n20 Europe Y Y \n41 Europe Y Y \n53 Europe Y Y \n54 Europe Y Y Y \n56 Europe Y Y Y \n58 Europe Y Y Y \n65 Europe Y Y Y \n71 Europe Y Y Y \n74 Europe Y Y Y \n78 Europe Y Y Y \n91 Europe Y Y Y \n98 Europe Y Y \n100 Europe Y Y Y \n103 Europe Y Y Y \n110 Europe Y Y Y \n133 Europe Y Y Y \n134 Europe Y Y Y \n135 Europe Y Y Y \n150 Europe Y Y \n161 Europe Y Y Y \n162 Europe Y Y \n173 Europe Y Y Y \n174 Europe Y Y Y \n179 Europe Y Y \n187 Europe Y Y Y \n190 Europe Y Y Y \n191 Europe Y Y Y \n220 North America Y ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
11ATAUTAustriaEuropeYYY
18BEBELBelgiumEuropeYYY
20BGBGRBulgariaEuropeYY
41CHCHESwitzerlandEuropeYY
53CYCYPCyprusEuropeYY
54CZCZECzechiaEuropeYYY
56DEDEUGermanyEuropeYYY
58DKDNKDenmarkEuropeYYY
65EEESTEstoniaEuropeYYY
71ESESPSpainEuropeYYY
74FIFINFinlandEuropeYYY
78FRFRAFranceEuropeYYY
91GRGRCGreeceEuropeYYY
98HRHRVCroatiaEuropeYY
100HUHUNHungaryEuropeYYY
103IEIRLIrelandEuropeYYY
110ITITAItalyEuropeYYY
133LTLTULithuaniaEuropeYYY
134LULUXLuxembourgEuropeYYY
135LVLVALatviaEuropeYYY
150MTMLTMaltaEuropeYY
161NLNLDNetherlandsEuropeYYY
162NONORNorwayEuropeYY
173PLPOLPolandEuropeYYY
174PTPRTPortugalEuropeYYY
179ROROURomaniaEuropeYY
187SESWESwedenEuropeYYY
190SISVNSloveniaEuropeYYY
191SKSVKSlovakiaEuropeYYY
220USUSAUnited States of AmericaNorth AmericaY
\n
" }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "western_df = tls_801[((tls_801.eu_member==\"Y\")|\n", " (tls_801.ctry_code == 'US')|\n", " (tls_801.ctry_code == 'NO')|\n", " (tls_801.ctry_code == 'CH')|\n", " (tls_801.ctry_code == 'UK'))].compute()\n", "western_df" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": "['CN',\n 'HK',\n 'MO',\n 'AT',\n 'BE',\n 'BG',\n 'CH',\n 'CY',\n 'CZ',\n 'DE',\n 'DK',\n 'EE',\n 'ES',\n 'FI',\n 'FR',\n 'GR',\n 'HR',\n 'HU',\n 'IE',\n 'IT',\n 'LT',\n 'LU',\n 'LV',\n 'MT',\n 'NL',\n 'NO',\n 'PL',\n 'PT',\n 'RO',\n 'SE',\n 'SI',\n 'SK',\n 'US']" }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ctry_list=list(china_df[\"ctry_code\"])+list(western_df[\"ctry_code\"])\n", "ctry_list" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 15.6 ms\n", "Wall time: 6 ms\n" ] } ], "source": [ "%%time\n", "tls_appln_interval = tls_201_p[((tls_201_p[\"appln_filing_year\"]>2010)&\n", " (tls_201_p[\"appln_filing_year\"]<2025))][\"appln_id\"].unique()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "tls_206_p_subgroup = tls_206_p[tls_206_p[\"person_ctry_code\"].isin(ctry_list)][[\"person_id\",\"person_ctry_code\"]]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 45min 6s\n", "Wall time: 11min 58s\n" ] } ], "source": [ "%%time\n", "appln_pers = (tls_207_p[tls_207_p['appln_id'].isin(tls_appln_interval.compute())]\n", " ).merge(\n", " tls_206_p_subgroup,\n", " on=\"person_id\",how=\"inner\")[[\"appln_id\",\"person_id\",\"person_ctry_code\"]].drop_duplicates()\n", "\n", "appln_pers = appln_pers[appln_pers[\"person_ctry_code\"].isin(ctry_list)].drop_duplicates()\n", "appln_pers.to_parquet(\"appln_pers.parquet\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": "Index(['appln_id', 'person_id', 'person_ctry_code'], dtype='object')" }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "appln_pers.columns" ] }, { "cell_type": "code", "execution_count": 32, "outputs": [ { "data": { "text/plain": "(Delayed('int-8f92fe68-e5a1-4f9e-a30c-77833e0fff26'), 3)" }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "appln_pers.shape" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[34], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mappln_pers\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mappln_id\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mnunique\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcompute\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\base.py:314\u001B[0m, in \u001B[0;36mDaskMethodsMixin.compute\u001B[1;34m(self, **kwargs)\u001B[0m\n\u001B[0;32m 290\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mcompute\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[0;32m 291\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"Compute this dask collection\u001B[39;00m\n\u001B[0;32m 292\u001B[0m \n\u001B[0;32m 293\u001B[0m \u001B[38;5;124;03m This turns a lazy Dask collection into its in-memory equivalent.\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 312\u001B[0m \u001B[38;5;124;03m dask.base.compute\u001B[39;00m\n\u001B[0;32m 313\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m--> 314\u001B[0m (result,) \u001B[38;5;241m=\u001B[39m compute(\u001B[38;5;28mself\u001B[39m, traverse\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m 315\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m result\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\base.py:599\u001B[0m, in \u001B[0;36mcompute\u001B[1;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001B[0m\n\u001B[0;32m 596\u001B[0m keys\u001B[38;5;241m.\u001B[39mappend(x\u001B[38;5;241m.\u001B[39m__dask_keys__())\n\u001B[0;32m 597\u001B[0m postcomputes\u001B[38;5;241m.\u001B[39mappend(x\u001B[38;5;241m.\u001B[39m__dask_postcompute__())\n\u001B[1;32m--> 599\u001B[0m results \u001B[38;5;241m=\u001B[39m schedule(dsk, keys, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m 600\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m repack([f(r, \u001B[38;5;241m*\u001B[39ma) \u001B[38;5;28;01mfor\u001B[39;00m r, (f, a) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mzip\u001B[39m(results, postcomputes)])\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\threaded.py:89\u001B[0m, in \u001B[0;36mget\u001B[1;34m(dsk, keys, cache, num_workers, pool, **kwargs)\u001B[0m\n\u001B[0;32m 86\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(pool, multiprocessing\u001B[38;5;241m.\u001B[39mpool\u001B[38;5;241m.\u001B[39mPool):\n\u001B[0;32m 87\u001B[0m pool \u001B[38;5;241m=\u001B[39m MultiprocessingPoolExecutor(pool)\n\u001B[1;32m---> 89\u001B[0m results \u001B[38;5;241m=\u001B[39m get_async(\n\u001B[0;32m 90\u001B[0m pool\u001B[38;5;241m.\u001B[39msubmit,\n\u001B[0;32m 91\u001B[0m pool\u001B[38;5;241m.\u001B[39m_max_workers,\n\u001B[0;32m 92\u001B[0m dsk,\n\u001B[0;32m 93\u001B[0m keys,\n\u001B[0;32m 94\u001B[0m cache\u001B[38;5;241m=\u001B[39mcache,\n\u001B[0;32m 95\u001B[0m get_id\u001B[38;5;241m=\u001B[39m_thread_get_id,\n\u001B[0;32m 96\u001B[0m pack_exception\u001B[38;5;241m=\u001B[39mpack_exception,\n\u001B[0;32m 97\u001B[0m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs,\n\u001B[0;32m 98\u001B[0m )\n\u001B[0;32m 100\u001B[0m \u001B[38;5;66;03m# Cleanup pools associated to dead threads\u001B[39;00m\n\u001B[0;32m 101\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m pools_lock:\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\local.py:500\u001B[0m, in \u001B[0;36mget_async\u001B[1;34m(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)\u001B[0m\n\u001B[0;32m 498\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m state[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mwaiting\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;129;01mor\u001B[39;00m state[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mready\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;129;01mor\u001B[39;00m state[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mrunning\u001B[39m\u001B[38;5;124m\"\u001B[39m]:\n\u001B[0;32m 499\u001B[0m fire_tasks(chunksize)\n\u001B[1;32m--> 500\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m key, res_info, failed \u001B[38;5;129;01min\u001B[39;00m \u001B[43mqueue_get\u001B[49m\u001B[43m(\u001B[49m\u001B[43mqueue\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241m.\u001B[39mresult():\n\u001B[0;32m 501\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m failed:\n\u001B[0;32m 502\u001B[0m exc, tb \u001B[38;5;241m=\u001B[39m loads(res_info)\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\dask\\local.py:130\u001B[0m, in \u001B[0;36mqueue_get\u001B[1;34m(q)\u001B[0m\n\u001B[0;32m 128\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m \u001B[38;5;28;01mTrue\u001B[39;00m:\n\u001B[0;32m 129\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 130\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mq\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[43mblock\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtimeout\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m0.1\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[0;32m 131\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m Empty:\n\u001B[0;32m 132\u001B[0m \u001B[38;5;28;01mpass\u001B[39;00m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\queue.py:180\u001B[0m, in \u001B[0;36mQueue.get\u001B[1;34m(self, block, timeout)\u001B[0m\n\u001B[0;32m 178\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m remaining \u001B[38;5;241m<\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m0.0\u001B[39m:\n\u001B[0;32m 179\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m Empty\n\u001B[1;32m--> 180\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mnot_empty\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mwait\u001B[49m\u001B[43m(\u001B[49m\u001B[43mremaining\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 181\u001B[0m item \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_get()\n\u001B[0;32m 182\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnot_full\u001B[38;5;241m.\u001B[39mnotify()\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\threading.py:316\u001B[0m, in \u001B[0;36mCondition.wait\u001B[1;34m(self, timeout)\u001B[0m\n\u001B[0;32m 314\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 315\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m timeout \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[1;32m--> 316\u001B[0m gotit \u001B[38;5;241m=\u001B[39m \u001B[43mwaiter\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43macquire\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtimeout\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 317\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 318\u001B[0m gotit \u001B[38;5;241m=\u001B[39m waiter\u001B[38;5;241m.\u001B[39macquire(\u001B[38;5;28;01mFalse\u001B[39;00m)\n", "\u001B[1;31mKeyboardInterrupt\u001B[0m: " ] } ], "source": [ "appln_pers[\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "id_selector = dd.read_parquet(\"appln_pers.parquet\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id person_id person_ctry_code\n0 515465980 76083836 CH\n1 530145641 76089672 AT\n2 532157050 76089962 CN\n3 532162718 76089962 CN\n4 532164120 76089962 CN", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idperson_idperson_ctry_code
051546598076083836CH
153014564176089672AT
253215705076089962CN
353216271876089962CN
453216412076089962CN
\n
" }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id_selector.head()" ] }, { "cell_type": "code", "execution_count": 50, "outputs": [ { "data": { "text/plain": "33" }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id_selector.person_ctry_code.nunique().compute()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 52, "outputs": [ { "data": { "text/plain": "person_ctry_code\nUS 16249179\nDE 4980374\nCN 4137237\nFR 1953182\nCH 819720\nNL 712685\nIT 659643\nSE 575791\nES 404734\nAT 341489\nBE 324202\nFI 308472\nDK 267490\nPL 256066\nNO 134220\nIE 123974\nCZ 120706\nHU 55884\nHK 51785\nRO 49656\nPT 44892\nLU 39736\nGR 29126\nSK 28316\nSI 27382\nBG 13125\nLV 10131\nEE 9364\nLT 8910\nHR 6540\nCY 6232\nMT 6021\nMO 1703\nName: count, dtype: int64" }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id_selector.person_ctry_code.value_counts().compute()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 52, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 29.2 s\n", "Wall time: 29.6 s\n" ] } ], "source": [ "%%time\n", "eu_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(western_df[\"ctry_code\"]))][\"appln_id\"].unique()\n", "china_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(china_df[\"ctry_code\"]))][\"appln_id\"].unique()\n", " \n", "common_id = id_selector[id_selector[\"appln_id\"].isin(eu_id.compute())&\n", " id_selector[\"appln_id\"].isin(china_id.compute())]\n", "\n", "common_id.to_parquet(\"common_id_CHEU.parquet\")" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 5.62 s\n", "Wall time: 5.61 s\n" ] }, { "data": { "text/plain": "203873" }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "common_id[\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "filtered_ids = dd.read_parquet(\"common_id_CHEU.parquet\")" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id person_id person_ctry_code\n102 531966031 76189975 CN\n104 533342168 76190204 CN\n117 504910639 76196951 DE\n144 537370384 76217739 AT\n160 527648360 76227149 ES", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idperson_idperson_ctry_code
10253196603176189975CN
10453334216876190204CN
11750491063976196951DE
14453737038476217739AT
16052764836076227149ES
\n
" }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_ids.head()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id person_id person_ctry_code\n9469 330225325 429997 CN\n16032 330225325 4555802 DE\n4448 330225397 4849499 US\n19897 330225397 60435 US\n31709 330225397 4284 US\n... ... ... ...\n69073 587679340 18114863 US\n74083 587679340 19330879 CN\n102624 587679340 53002958 US\n73989 587679340 41556280 US\n101114 587679340 48821759 US\n\n[1025221 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idperson_idperson_ctry_code
9469330225325429997CN
160323302253254555802DE
44483302253974849499US
1989733022539760435US
317093302253974284US
............
6907358767934018114863US
7408358767934019330879CN
10262458767934053002958US
7398958767934041556280US
10111458767934048821759US
\n

1025221 rows × 3 columns

\n
" }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_ids.sort_values(by=\"appln_id\").compute()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 234 ms\n", "Wall time: 216 ms\n" ] } ], "source": [ "%%time\n", "id_scope=filtered_ids[\"appln_id\"].unique().compute()\n", "pers_id_scope=filtered_ids[\"person_id\"].unique().compute()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# tls_201_p.head()" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "outdir = \"WESTERN_CH_scope\"\n", "os.makedirs(outdir, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 4min 42s\n", "Wall time: 2min 58s\n" ] } ], "source": [ "%%time\n", "#Application data\n", "tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n", "tls_201_scope = tls_201_p[tls_201_p['appln_id'].isin(id_scope)]\n", "tls_201_scope.compute().to_csv(f\"{outdir}/tls_201_scope.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 1min 46s\n", "Wall time: 31.4 s\n" ] } ], "source": [ "%%time\n", "#Person-appln data\n", "tls_207_p = dd.read_parquet(\"tls_207.parquet\")\n", "tls_207_scope = tls_207_p[((tls_207_p['person_id'].isin(pers_id_scope))&\n", " (tls_207_p['appln_id'].isin(id_scope)))]\n", "tls_207_scope.compute().to_csv(f\"{outdir}/tls_207_scope.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 4min 59s\n", "Wall time: 3min 38s\n" ] } ], "source": [ "%%time\n", "#Person data\n", "tls_206_p = dd.read_parquet(\"tls_206.parquet\")\n", "tls_206_scope = tls_206_p[tls_206_p['person_id'].isin(pers_id_scope)]\n", "tls_206_scope.compute().to_csv(f\"{outdir}/tls_206_scope.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2min 49s\n", "Wall time: 2min 6s\n" ] } ], "source": [ "%%time\n", "#Application title data\n", "tls_202_p = dd.read_csv(\"table_tls202.csv\")\n", "tls_202_scope = tls_202_p[tls_202_p['appln_id'].isin(id_scope)]\n", "tls_202_scope.compute().to_csv(f\"{outdir}/tls_202_scope.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 3min 17s\n", "Wall time: 2min 1s\n" ] } ], "source": [ "%%time\n", "#IPC data\n", "tls_224_p = dd.read_csv(\"table_tls224.csv\")\n", "tls_224_p_scope = tls_224_p[tls_224_p['appln_id'].isin(id_scope)]\n", "tls_224_p_scope.compute().to_csv(f\"{outdir}/tls_224_scope.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }