{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 125 ms\n", "Wall time: 269 ms\n" ] } ], "source": [ "%%time\n", "import dask" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": "" }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.set(temporary_directory=r'D:\\PATSTAT\\dask_temp')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": "" }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.set({'temporary_directory': r'D:\\PATSTAT\\dask_temp'})" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": "{'temporary-directory': 'D:\\\\PATSTAT\\\\dask_temp',\n 'visualization': {'engine': None},\n 'tokenize': {'ensure-deterministic': False},\n 'dataframe': {'backend': 'pandas',\n 'shuffle': {'method': None, 'compression': None},\n 'parquet': {'metadata-task-size-local': 512, 'metadata-task-size-remote': 1},\n 'dtype_backend': 'pandas',\n 'convert_string': False},\n 'array': {'backend': 'numpy',\n 'rechunk': {'method': 'tasks'},\n 'svg': {'size': 120},\n 'slicing': {'split-large-chunks': None}},\n 'optimization': {'annotations': {'fuse': True},\n 'fuse': {'active': None,\n 'ave-width': 1,\n 'max-width': None,\n 'max-height': inf,\n 'max-depth-new-edges': None,\n 'subgraphs': None,\n 'rename-keys': True}}}" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.config" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import dask.dataframe as dd\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n", "D:\\PATSTAT\n" ] } ], "source": [ "import os\n", "print(os.getcwd()) # Prints the current working directory\n", "\n", "workdir_path=r\"D:\\PATSTAT\"\n", "os.chdir(workdir_path)\n", "print(os.getcwd())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# tls_201 = dd.read_csv(\"table_tls201.csv\", low_memory=False,dtype={'appln_nr': 'object',\n", "# 'appln_nr_original': 'object'})\n", "# tls_201.head()\n", "# tls_206 = dd.read_csv(\"table_tls206.csv\", low_memory=False)\n", "# tls_206.head()\n", "# tls_207 = dd.read_csv(\"table_tls207.csv\", low_memory=False)\n", "# tls_207.head()\n", "# tls_207.to_parquet(\"tls_207.parquet\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 0 XX None D 9999-12-31 \\\n1 1 EP 103094.0 A 2000-02-15 \n2 2 EP 107845.0 A 1992-12-02 \n3 3 EP 202556.0 A 2000-07-17 \n4 4 EP 300208.0 A 2000-01-13 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 9999 None PI \\\n1 2000 00103094 PI \n2 1992 00107845 PI \n3 2000 00202556 PI \n4 2000 00300208 PI \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n0 0 ... 9999-12-31 9999 \\\n1 0 ... 2000-09-20 2000 \n2 0 ... 2000-08-02 2000 \n3 0 ... 2001-01-24 2001 \n4 0 ... 2000-07-26 2000 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n0 0 N 0 0 \\\n1 293253293 Y 8554171 1 \n2 301548848 Y 27517085 2 \n3 291964096 N 7915918 3 \n4 292901055 Y 22889365 4 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n0 1 0 0 0 \n1 6 79 1 4 \n2 8 56 2 6 \n3 4 22 2 3 \n4 6 27 1 2 \n\n[5 rows x 26 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idappln_authappln_nrappln_kindappln_filing_dateappln_filing_yearappln_nr_originalipr_typereceiving_officeinternat_appln_id...earliest_publn_dateearliest_publn_yearearliest_pat_publn_idgranteddocdb_family_idinpadoc_family_iddocdb_family_sizenb_citing_docdb_famnb_applicantsnb_inventors
00XXNoneD9999-12-319999NonePI0...9999-12-3199990N001000
11EP103094.0A2000-02-15200000103094PI0...2000-09-202000293253293Y8554171167914
22EP107845.0A1992-12-02199200107845PI0...2000-08-022000301548848Y27517085285626
33EP202556.0A2000-07-17200000202556PI0...2001-01-242001291964096N7915918342223
44EP300208.0A2000-01-13200000300208PI0...2000-07-262000292901055Y22889365462712
\n

5 rows × 26 columns

\n
" }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Application data\n", "tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n", "tls_201_p.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n", "# (tls_201_p[\"appln_filing_year\"]<2024)&\n", "# (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n", "# (tls_201_p[\"appln_filing_year\"]<2024)&\n", "# (tls_201_p[\"granted\"]==\"N\"))][\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": " person_id person_name person_ctry_code\n0 1 Nokia Corporation FI\n1 2 Lipponen, Markku FI\n2 3 Laitinen, Timo FI\n3 4 Aho, Ari FI\n4 5 Knuutila, Jarno FI", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
person_idperson_nameperson_ctry_code
01Nokia CorporationFI
12Lipponen, MarkkuFI
23Laitinen, TimoFI
34Aho, AriFI
45Knuutila, JarnoFI
\n
" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_206_p = dd.read_parquet(\"tls_206.parquet\",columns=[\"person_id\",\"person_name\",\"person_ctry_code\"])\n", "tls_206_p.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": " person_id appln_id\n0 1 1\n1 1 7\n2 1 46\n3 1 775\n4 1 1192", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
person_idappln_id
011
117
2146
31775
411192
\n
" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_207_p = dd.read_parquet(\"tls_207.parquet\",columns=[\"person_id\",\"appln_id\"])\n", "tls_207_p.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# tls_207_p[tls_207_p[\"appln_id\"]==1].compute()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent \n0 unknown Y NaN \\\n1 AD AND Andorra Europe \n2 AE ARE United Arab Emirates Asia \n3 AF AFG Afghanistan Asia \n4 AG ATG Antigua and Barbuda North America \n\n eu_member epo_member oecd_member discontinued \n0 \n1 \n2 \n3 \n4 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
0unknownYNaN
1ADANDAndorraEurope
2AEAREUnited Arab EmiratesAsia
3AFAFGAfghanistanAsia
4AGATGAntigua and BarbudaNorth America
\n
" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_801 = dd.read_csv(\"table_tls801.csv\", low_memory=False)\n", "tls_801.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": " ctry_code iso_alpha3 st3_name organisation_flag \n47 CN CHN China \\\n97 HK HKG Hong Kong, China \n147 MO MAC Macao SAR (China) \n217 TW TWN Taiwan Province Of China \n\n continent eu_member epo_member oecd_member discontinued \n47 Asia \n97 Asia \n147 Asia \n217 Asia ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
47CNCHNChinaAsia
97HKHKGHong Kong, ChinaAsia
147MOMACMacao SAR (China)Asia
217TWTWNTaiwan Province Of ChinaAsia
\n
" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "china_df = tls_801[tls_801.st3_name.str.lower().str.contains(\"china\")].compute()\n", "china_df" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": " ctry_code iso_alpha3 st3_name organisation_flag continent eu_member \n12 AT AUT Austria Europe Y \\\n19 BE BEL Belgium Europe Y \n21 BG BGR Bulgaria Europe Y \n54 CY CYP Cyprus Europe Y \n55 CZ CZE Czechia Europe Y \n57 DE DEU Germany Europe Y \n59 DK DNK Denmark Europe Y \n66 EE EST Estonia Europe Y \n72 ES ESP Spain Europe Y \n75 FI FIN Finland Europe Y \n79 FR FRA France Europe Y \n92 GR GRC Greece Europe Y \n99 HR HRV Croatia Europe Y \n101 HU HUN Hungary Europe Y \n104 IE IRL Ireland Europe Y \n111 IT ITA Italy Europe Y \n134 LT LTU Lithuania Europe Y \n135 LU LUX Luxembourg Europe Y \n136 LV LVA Latvia Europe Y \n151 MT MLT Malta Europe Y \n162 NL NLD Netherlands Europe Y \n174 PL POL Poland Europe Y \n175 PT PRT Portugal Europe Y \n180 RO ROU Romania Europe Y \n188 SE SWE Sweden Europe Y \n191 SI SVN Slovenia Europe Y \n192 SK SVK Slovakia Europe Y \n\n epo_member oecd_member discontinued \n12 Y Y \n19 Y Y \n21 Y \n54 Y \n55 Y Y \n57 Y Y \n59 Y Y \n66 Y Y \n72 Y Y \n75 Y Y \n79 Y Y \n92 Y Y \n99 Y \n101 Y Y \n104 Y Y \n111 Y Y \n134 Y Y \n135 Y Y \n136 Y Y \n151 Y \n162 Y Y \n174 Y Y \n175 Y Y \n180 Y \n188 Y Y \n191 Y Y \n192 Y Y ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
12ATAUTAustriaEuropeYYY
19BEBELBelgiumEuropeYYY
21BGBGRBulgariaEuropeYY
54CYCYPCyprusEuropeYY
55CZCZECzechiaEuropeYYY
57DEDEUGermanyEuropeYYY
59DKDNKDenmarkEuropeYYY
66EEESTEstoniaEuropeYYY
72ESESPSpainEuropeYYY
75FIFINFinlandEuropeYYY
79FRFRAFranceEuropeYYY
92GRGRCGreeceEuropeYYY
99HRHRVCroatiaEuropeYY
101HUHUNHungaryEuropeYYY
104IEIRLIrelandEuropeYYY
111ITITAItalyEuropeYYY
134LTLTULithuaniaEuropeYYY
135LULUXLuxembourgEuropeYYY
136LVLVALatviaEuropeYYY
151MTMLTMaltaEuropeYY
162NLNLDNetherlandsEuropeYYY
174PLPOLPolandEuropeYYY
175PTPRTPortugalEuropeYYY
180ROROURomaniaEuropeYY
188SESWESwedenEuropeYYY
191SISVNSloveniaEuropeYYY
192SKSVKSlovakiaEuropeYYY
\n
" }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eu_df = tls_801[tls_801.eu_member==\"Y\"].compute()\n", "eu_df" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "ctry_list=list(china_df[\"ctry_code\"])+list(eu_df[\"ctry_code\"])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 0 ns\n", "Wall time: 0 ns\n" ] } ], "source": [ "%%time\n", "tls_appln_interval = tls_201_p[((tls_201_p[\"appln_filing_year\"]>2010)&\n", " (tls_201_p[\"appln_filing_year\"]<2024))][\"appln_id\"].unique()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "tls_206_p_subgroup = tls_206_p[tls_206_p[\"person_ctry_code\"].isin(ctry_list)][[\"person_id\",\"person_ctry_code\"]]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 59.8 s\n", "Wall time: 33.3 s\n" ] } ], "source": [ "%%time\n", "appln_pers = (tls_207_p[tls_207_p['appln_id'].isin(tls_appln_interval.compute())]\n", " ).merge(\n", " tls_206_p_subgroup,\n", " on=\"person_id\",how=\"inner\")[[\"appln_id\",\"person_id\",\"person_ctry_code\"]].drop_duplicates()\n", "\n", "appln_pers = appln_pers[appln_pers[\"person_ctry_code\"].isin(ctry_list)].drop_duplicates()\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": "Index(['appln_id', 'person_id', 'person_ctry_code'], dtype='object')" }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "appln_pers.columns" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "scrolled": false }, "outputs": [], "source": [ "appln_pers.to_parquet(\"appln_pers.parquet\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": "5221245" }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "appln_pers[\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "id_selector = dd.read_parquet(\"appln_pers.parquet\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id person_id person_ctry_code\n0 535761830 76992434 DE\n1 529397031 76992470 CN\n2 529397099 76992470 CN\n3 410337532 76993145 DE\n4 528363154 76993242 DE", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idperson_idperson_ctry_code
053576183076992434DE
152939703176992470CN
252939709976992470CN
341033753276993145DE
452836315476993242DE
\n
" }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id_selector.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 11.6 s\n", "Wall time: 11.8 s\n" ] } ], "source": [ "%%time\n", "eu_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(eu_df[\"ctry_code\"]))][\"appln_id\"].unique()\n", "china_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(china_df[\"ctry_code\"]))][\"appln_id\"].unique()\n", " \n", "common_id = id_selector[id_selector[\"appln_id\"].isin(eu_id.compute())&\n", " id_selector[\"appln_id\"].isin(china_id.compute())]\n", "\n", "common_id.to_parquet(\"common_id_CHEU.parquet\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2.55 s\n", "Wall time: 2.59 s\n" ] }, { "data": { "text/plain": "64266" }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "common_id[\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "filtered_ids = dd.read_parquet(\"common_id_CHEU.parquet\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id person_id person_ctry_code\n125 531979430 77025643 CN\n138 533281318 77040504 DK\n139 557203984 77040504 DK\n140 558802816 77040504 DK\n141 575538984 77040504 DK", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idperson_idperson_ctry_code
12553197943077025643CN
13853328131877040504DK
13955720398477040504DK
14055880281677040504DK
14157553898477040504DK
\n
" }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_ids.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id person_id person_ctry_code\n22116 330225325 4555802 DE\n18943 330225325 429997 CN\n21222 330322632 4853096 CN\n661 330322632 524 DE\n6945 330322632 4853097 DE\n... ... ... ...\n32498 575556091 13012618 CN\n44415 575556091 53072159 CN\n73579 575556091 85317179 CN\n17265 575556091 12469480 FR\n56058 575556091 53831662 CN\n\n[274001 rows x 3 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idperson_idperson_ctry_code
221163302253254555802DE
18943330225325429997CN
212223303226324853096CN
661330322632524DE
69453303226324853097DE
............
3249857555609113012618CN
4441557555609153072159CN
7357957555609185317179CN
1726557555609112469480FR
5605857555609153831662CN
\n

274001 rows × 3 columns

\n
" }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_ids.sort_values(by=\"appln_id\").compute()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 46.9 ms\n", "Wall time: 68.1 ms\n" ] } ], "source": [ "%%time\n", "id_scope=filtered_ids[\"appln_id\"].unique().compute()\n", "pers_id_scope=filtered_ids[\"person_id\"].unique().compute()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": " appln_id appln_auth appln_nr appln_kind appln_filing_date \n0 0 XX None D 9999-12-31 \\\n1 1 EP 103094.0 A 2000-02-15 \n2 2 EP 107845.0 A 1992-12-02 \n3 3 EP 202556.0 A 2000-07-17 \n4 4 EP 300208.0 A 2000-01-13 \n\n appln_filing_year appln_nr_original ipr_type receiving_office \n0 9999 None PI \\\n1 2000 00103094 PI \n2 1992 00107845 PI \n3 2000 00202556 PI \n4 2000 00300208 PI \n\n internat_appln_id ... earliest_publn_date earliest_publn_year \n0 0 ... 9999-12-31 9999 \\\n1 0 ... 2000-09-20 2000 \n2 0 ... 2000-08-02 2000 \n3 0 ... 2001-01-24 2001 \n4 0 ... 2000-07-26 2000 \n\n earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n0 0 N 0 0 \\\n1 293253293 Y 8554171 1 \n2 301548848 Y 27517085 2 \n3 291964096 N 7915918 3 \n4 292901055 Y 22889365 4 \n\n docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n0 1 0 0 0 \n1 6 79 1 4 \n2 8 56 2 6 \n3 4 22 2 3 \n4 6 27 1 2 \n\n[5 rows x 26 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
appln_idappln_authappln_nrappln_kindappln_filing_dateappln_filing_yearappln_nr_originalipr_typereceiving_officeinternat_appln_id...earliest_publn_dateearliest_publn_yearearliest_pat_publn_idgranteddocdb_family_idinpadoc_family_iddocdb_family_sizenb_citing_docdb_famnb_applicantsnb_inventors
00XXNoneD9999-12-319999NonePI0...9999-12-3199990N001000
11EP103094.0A2000-02-15200000103094PI0...2000-09-202000293253293Y8554171167914
22EP107845.0A1992-12-02199200107845PI0...2000-08-022000301548848Y27517085285626
33EP202556.0A2000-07-17200000202556PI0...2001-01-242001291964096N7915918342223
44EP300208.0A2000-01-13200000300208PI0...2000-07-262000292901055Y22889365462712
\n

5 rows × 26 columns

\n
" }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_201_p.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "outdir = \"EU_CH_scope\"\n", "os.makedirs(outdir, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 4min 8s\n", "Wall time: 2min 42s\n" ] } ], "source": [ "%%time\n", "#Application data\n", "tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n", "tls_201_scope = tls_201_p[tls_201_p['appln_id'].isin(id_scope)]\n", "tls_201_scope.compute().to_csv(f\"{outdir}/tls_201_scope.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 1min 15s\n", "Wall time: 23.9 s\n" ] } ], "source": [ "%%time\n", "#Person-appln data\n", "tls_207_p = dd.read_parquet(\"tls_207.parquet\")\n", "tls_207_scope = tls_207_p[((tls_207_p['person_id'].isin(pers_id_scope))&\n", " (tls_207_p['appln_id'].isin(id_scope)))]\n", "tls_207_scope.compute().to_csv(f\"{outdir}/tls_207_scope.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 4min 13s\n", "Wall time: 3min 3s\n" ] } ], "source": [ "%%time\n", "#Person data\n", "tls_206_p = dd.read_parquet(\"tls_206.parquet\")\n", "tls_206_scope = tls_206_p[tls_206_p['person_id'].isin(pers_id_scope)]\n", "tls_206_scope.compute().to_csv(f\"{outdir}/tls_206_scope.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2min 8s\n", "Wall time: 1min 29s\n" ] } ], "source": [ "%%time\n", "#Application title data\n", "tls_202_p = dd.read_csv(\"table_tls202.csv\")\n", "tls_202_scope = tls_202_p[tls_202_p['appln_id'].isin(id_scope)]\n", "tls_202_scope.compute().to_csv(f\"{outdir}/tls_202_scope.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2min 36s\n", "Wall time: 1min 28s\n" ] } ], "source": [ "%%time\n", "#IPC data\n", "tls_224_p = dd.read_csv(\"table_tls224.csv\")\n", "tls_224_p_scope = tls_224_p[tls_224_p['appln_id'].isin(id_scope)]\n", "tls_224_p_scope.compute().to_csv(f\"{outdir}/tls_224_scope.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }