{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 109 ms\n", "Wall time: 114 ms\n" ] } ], "source": [ "%%time\n", "import dask" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.set(temporary_directory=r'D:\\PATSTAT\\dask_temp')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.set({'temporary_directory': r'D:\\PATSTAT\\dask_temp'})" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'temporary-directory': 'D:\\\\PATSTAT\\\\dask_temp',\n", " 'visualization': {'engine': None},\n", " 'tokenize': {'ensure-deterministic': False},\n", " 'dataframe': {'backend': 'pandas',\n", " 'shuffle': {'method': None, 'compression': None},\n", " 'parquet': {'metadata-task-size-local': 512, 'metadata-task-size-remote': 1},\n", " 'dtype_backend': 'pandas',\n", " 'convert_string': False},\n", " 'array': {'backend': 'numpy',\n", " 'rechunk': {'method': 'tasks'},\n", " 'svg': {'size': 120},\n", " 'slicing': {'split-large-chunks': None}},\n", " 'optimization': {'annotations': {'fuse': True},\n", " 'fuse': {'active': None,\n", " 'ave-width': 1,\n", " 'max-width': None,\n", " 'max-height': inf,\n", " 'max-depth-new-edges': None,\n", " 'subgraphs': None,\n", " 'rename-keys': True}}}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dask.config.config" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import dask.dataframe as dd\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n", "D:\\PATSTAT\n" ] } ], "source": [ "import os\n", "print(os.getcwd()) # Prints the current working directory\n", "\n", "workdir_path=r\"D:\\PATSTAT\"\n", "os.chdir(workdir_path)\n", "print(os.getcwd())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# tls_201 = dd.read_csv(\"table_tls201.csv\", low_memory=False,dtype={'appln_nr': 'object',\n", "# 'appln_nr_original': 'object'})\n", "# tls_201.head()\n", "# tls_206 = dd.read_csv(\"table_tls206.csv\", low_memory=False)\n", "# tls_206.head()\n", "# tls_207 = dd.read_csv(\"table_tls207.csv\", low_memory=False)\n", "# tls_207.head()\n", "# tls_207.to_parquet(\"tls_207.parquet\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idappln_authappln_nrappln_kindappln_filing_dateappln_filing_yearappln_nr_originalipr_typereceiving_officeinternat_appln_id...earliest_publn_dateearliest_publn_yearearliest_pat_publn_idgranteddocdb_family_idinpadoc_family_iddocdb_family_sizenb_citing_docdb_famnb_applicantsnb_inventors
00XXNoneD9999-12-319999NonePI0...9999-12-3199990N001000
11EP103094.0A2000-02-15200000103094PI0...2000-09-202000293253293Y8554171167914
22EP107845.0A1992-12-02199200107845PI0...2000-08-022000301548848Y27517085285626
33EP202556.0A2000-07-17200000202556PI0...2001-01-242001291964096N7915918342223
44EP300208.0A2000-01-13200000300208PI0...2000-07-262000292901055Y22889365462712
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " appln_id appln_auth appln_nr appln_kind appln_filing_date \n", "0 0 XX None D 9999-12-31 \\\n", "1 1 EP 103094.0 A 2000-02-15 \n", "2 2 EP 107845.0 A 1992-12-02 \n", "3 3 EP 202556.0 A 2000-07-17 \n", "4 4 EP 300208.0 A 2000-01-13 \n", "\n", " appln_filing_year appln_nr_original ipr_type receiving_office \n", "0 9999 None PI \\\n", "1 2000 00103094 PI \n", "2 1992 00107845 PI \n", "3 2000 00202556 PI \n", "4 2000 00300208 PI \n", "\n", " internat_appln_id ... earliest_publn_date earliest_publn_year \n", "0 0 ... 9999-12-31 9999 \\\n", "1 0 ... 2000-09-20 2000 \n", "2 0 ... 2000-08-02 2000 \n", "3 0 ... 2001-01-24 2001 \n", "4 0 ... 2000-07-26 2000 \n", "\n", " earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n", "0 0 N 0 0 \\\n", "1 293253293 Y 8554171 1 \n", "2 301548848 Y 27517085 2 \n", "3 291964096 N 7915918 3 \n", "4 292901055 Y 22889365 4 \n", "\n", " docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n", "0 1 0 0 0 \n", "1 6 79 1 4 \n", "2 8 56 2 6 \n", "3 4 22 2 3 \n", "4 6 27 1 2 \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Application data\n", "tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n", "tls_201_p.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n", "# (tls_201_p[\"appln_filing_year\"]<2024)&\n", "# (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n", "# (tls_201_p[\"appln_filing_year\"]<2024)&\n", "# (tls_201_p[\"granted\"]==\"N\"))][\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
person_idperson_nameperson_ctry_code
01Nokia CorporationFI
12Lipponen, MarkkuFI
23Laitinen, TimoFI
34Aho, AriFI
45Knuutila, JarnoFI
\n", "
" ], "text/plain": [ " person_id person_name person_ctry_code\n", "0 1 Nokia Corporation FI\n", "1 2 Lipponen, Markku FI\n", "2 3 Laitinen, Timo FI\n", "3 4 Aho, Ari FI\n", "4 5 Knuutila, Jarno FI" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_206_p = dd.read_parquet(\"tls_206.parquet\",columns=[\"person_id\",\"person_name\",\"person_ctry_code\"])\n", "tls_206_p.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
person_idappln_id
011
117
2146
31775
411192
\n", "
" ], "text/plain": [ " person_id appln_id\n", "0 1 1\n", "1 1 7\n", "2 1 46\n", "3 1 775\n", "4 1 1192" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_207_p = dd.read_parquet(\"tls_207.parquet\",columns=[\"person_id\",\"appln_id\"])\n", "tls_207_p.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# tls_207_p[tls_207_p[\"appln_id\"]==1].compute()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
0unknownYNaN
1ADANDAndorraEurope
2AEAREUnited Arab EmiratesAsia
3AFAFGAfghanistanAsia
4AGATGAntigua and BarbudaNorth America
\n", "
" ], "text/plain": [ " ctry_code iso_alpha3 st3_name organisation_flag continent \n", "0 unknown Y NaN \\\n", "1 AD AND Andorra Europe \n", "2 AE ARE United Arab Emirates Asia \n", "3 AF AFG Afghanistan Asia \n", "4 AG ATG Antigua and Barbuda North America \n", "\n", " eu_member epo_member oecd_member discontinued \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_801 = dd.read_csv(\"table_tls801.csv\", low_memory=False)\n", "tls_801.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
47CNCHNChinaAsia
97HKHKGHong Kong, ChinaAsia
147MOMACMacao SAR (China)Asia
217TWTWNTaiwan Province Of ChinaAsia
\n", "
" ], "text/plain": [ " ctry_code iso_alpha3 st3_name organisation_flag \n", "47 CN CHN China \\\n", "97 HK HKG Hong Kong, China \n", "147 MO MAC Macao SAR (China) \n", "217 TW TWN Taiwan Province Of China \n", "\n", " continent eu_member epo_member oecd_member discontinued \n", "47 Asia \n", "97 Asia \n", "147 Asia \n", "217 Asia " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "china_df = tls_801[tls_801.st3_name.str.lower().str.contains(\"china\")].compute()\n", "china_df" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ctry_codeiso_alpha3st3_nameorganisation_flagcontinenteu_memberepo_memberoecd_memberdiscontinued
12ATAUTAustriaEuropeYYY
19BEBELBelgiumEuropeYYY
21BGBGRBulgariaEuropeYY
54CYCYPCyprusEuropeYY
55CZCZECzechiaEuropeYYY
57DEDEUGermanyEuropeYYY
59DKDNKDenmarkEuropeYYY
66EEESTEstoniaEuropeYYY
72ESESPSpainEuropeYYY
75FIFINFinlandEuropeYYY
79FRFRAFranceEuropeYYY
92GRGRCGreeceEuropeYYY
99HRHRVCroatiaEuropeYY
101HUHUNHungaryEuropeYYY
104IEIRLIrelandEuropeYYY
111ITITAItalyEuropeYYY
134LTLTULithuaniaEuropeYYY
135LULUXLuxembourgEuropeYYY
136LVLVALatviaEuropeYYY
151MTMLTMaltaEuropeYY
162NLNLDNetherlandsEuropeYYY
174PLPOLPolandEuropeYYY
175PTPRTPortugalEuropeYYY
180ROROURomaniaEuropeYY
188SESWESwedenEuropeYYY
191SISVNSloveniaEuropeYYY
192SKSVKSlovakiaEuropeYYY
\n", "
" ], "text/plain": [ " ctry_code iso_alpha3 st3_name organisation_flag continent eu_member \n", "12 AT AUT Austria Europe Y \\\n", "19 BE BEL Belgium Europe Y \n", "21 BG BGR Bulgaria Europe Y \n", "54 CY CYP Cyprus Europe Y \n", "55 CZ CZE Czechia Europe Y \n", "57 DE DEU Germany Europe Y \n", "59 DK DNK Denmark Europe Y \n", "66 EE EST Estonia Europe Y \n", "72 ES ESP Spain Europe Y \n", "75 FI FIN Finland Europe Y \n", "79 FR FRA France Europe Y \n", "92 GR GRC Greece Europe Y \n", "99 HR HRV Croatia Europe Y \n", "101 HU HUN Hungary Europe Y \n", "104 IE IRL Ireland Europe Y \n", "111 IT ITA Italy Europe Y \n", "134 LT LTU Lithuania Europe Y \n", "135 LU LUX Luxembourg Europe Y \n", "136 LV LVA Latvia Europe Y \n", "151 MT MLT Malta Europe Y \n", "162 NL NLD Netherlands Europe Y \n", "174 PL POL Poland Europe Y \n", "175 PT PRT Portugal Europe Y \n", "180 RO ROU Romania Europe Y \n", "188 SE SWE Sweden Europe Y \n", "191 SI SVN Slovenia Europe Y \n", "192 SK SVK Slovakia Europe Y \n", "\n", " epo_member oecd_member discontinued \n", "12 Y Y \n", "19 Y Y \n", "21 Y \n", "54 Y \n", "55 Y Y \n", "57 Y Y \n", "59 Y Y \n", "66 Y Y \n", "72 Y Y \n", "75 Y Y \n", "79 Y Y \n", "92 Y Y \n", "99 Y \n", "101 Y Y \n", "104 Y Y \n", "111 Y Y \n", "134 Y Y \n", "135 Y Y \n", "136 Y Y \n", "151 Y \n", "162 Y Y \n", "174 Y Y \n", "175 Y Y \n", "180 Y \n", "188 Y Y \n", "191 Y Y \n", "192 Y Y " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eu_df = tls_801[tls_801.eu_member==\"Y\"].compute()\n", "eu_df" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "ctry_list=list(china_df[\"ctry_code\"])+list(eu_df[\"ctry_code\"])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 0 ns\n", "Wall time: 7.98 ms\n" ] } ], "source": [ "%%time\n", "tls_appln_interval = tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n", " (tls_201_p[\"appln_filing_year\"]<2024)&\n", " (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].unique()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "tls_206_p_subgroup = tls_206_p[tls_206_p[\"person_ctry_code\"].isin(ctry_list)][[\"person_id\",\"person_ctry_code\"]]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 44.7 s\n", "Wall time: 25.5 s\n" ] } ], "source": [ "%%time\n", "appln_pers = (tls_207_p[tls_207_p['appln_id'].isin(tls_appln_interval.compute())]\n", " ).merge(\n", " tls_206_p_subgroup,\n", " on=\"person_id\",how=\"inner\")[[\"appln_id\",\"person_id\",\"person_ctry_code\"]].drop_duplicates()\n", "\n", "appln_pers = appln_pers[appln_pers[\"person_ctry_code\"].isin(ctry_list)].drop_duplicates()\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['appln_id', 'person_id', 'person_ctry_code'], dtype='object')" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "appln_pers.columns" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "scrolled": false }, "outputs": [], "source": [ "appln_pers.to_parquet(\"appln_pers.parquet\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2156886" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "appln_pers[\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "id_selector = dd.read_parquet(\"appln_pers.parquet\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idperson_idperson_ctry_code
051139748563445371BE
145118394763448688BG
247796274263474274CN
348608826663484131CN
448808516963484131CN
\n", "
" ], "text/plain": [ " appln_id person_id person_ctry_code\n", "0 511397485 63445371 BE\n", "1 451183947 63448688 BG\n", "2 477962742 63474274 CN\n", "3 486088266 63484131 CN\n", "4 488085169 63484131 CN" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id_selector.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 3.91 s\n", "Wall time: 3.94 s\n" ] } ], "source": [ "%%time\n", "eu_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(eu_df[\"ctry_code\"]))][\"appln_id\"].unique()\n", "china_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(china_df[\"ctry_code\"]))][\"appln_id\"].unique()\n", " \n", "common_id = id_selector[id_selector[\"appln_id\"].isin(eu_id.compute())&\n", " id_selector[\"appln_id\"].isin(china_id.compute())]\n", "\n", "common_id.to_parquet(\"common_id_CHEU.parquet\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 734 ms\n", "Wall time: 730 ms\n" ] }, { "data": { "text/plain": [ "21420" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "common_id[\"appln_id\"].nunique().compute()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "filtered_ids = dd.read_parquet(\"common_id_CHEU.parquet\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idperson_idperson_ctry_code
5945695266663672357CN
7945937785563692722CN
9944524872063758608CN
16341276750763911789CN
31248656787864332187NL
\n", "
" ], "text/plain": [ " appln_id person_id person_ctry_code\n", "59 456952666 63672357 CN\n", "79 459377855 63692722 CN\n", "99 445248720 63758608 CN\n", "163 412767507 63911789 CN\n", "312 486567878 64332187 NL" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_ids.head()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idperson_idperson_ctry_code
55143406570365111182DK
1619034065703649115429CN
2182434065703653334372DK
1888134098241049119386CN
54453409824101949211DK
............
1581057539955240791395SE
1770457539955249019064SE
1214657539955242040993CN
63705754066087075122DE
1182757540660818911286CN
\n", "

87653 rows × 3 columns

\n", "
" ], "text/plain": [ " appln_id person_id person_ctry_code\n", "5514 340657036 5111182 DK\n", "16190 340657036 49115429 CN\n", "21824 340657036 53334372 DK\n", "18881 340982410 49119386 CN\n", "5445 340982410 1949211 DK\n", "... ... ... ...\n", "15810 575399552 40791395 SE\n", "17704 575399552 49019064 SE\n", "12146 575399552 42040993 CN\n", "6370 575406608 7075122 DE\n", "11827 575406608 18911286 CN\n", "\n", "[87653 rows x 3 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_ids.sort_values(by=\"appln_id\").compute()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 15.6 ms\n", "Wall time: 31.1 ms\n" ] } ], "source": [ "%%time\n", "id_scope=filtered_ids[\"appln_id\"].unique().compute()\n", "pers_id_scope=filtered_ids[\"person_id\"].unique().compute()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idappln_authappln_nrappln_kindappln_filing_dateappln_filing_yearappln_nr_originalipr_typereceiving_officeinternat_appln_id...earliest_publn_dateearliest_publn_yearearliest_pat_publn_idgranteddocdb_family_idinpadoc_family_iddocdb_family_sizenb_citing_docdb_famnb_applicantsnb_inventors
00XXNoneD9999-12-319999NonePI0...9999-12-3199990N001000
11EP103094.0A2000-02-15200000103094PI0...2000-09-202000293253293Y8554171167914
22EP107845.0A1992-12-02199200107845PI0...2000-08-022000301548848Y27517085285626
33EP202556.0A2000-07-17200000202556PI0...2001-01-242001291964096N7915918342223
44EP300208.0A2000-01-13200000300208PI0...2000-07-262000292901055Y22889365462712
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " appln_id appln_auth appln_nr appln_kind appln_filing_date \n", "0 0 XX None D 9999-12-31 \\\n", "1 1 EP 103094.0 A 2000-02-15 \n", "2 2 EP 107845.0 A 1992-12-02 \n", "3 3 EP 202556.0 A 2000-07-17 \n", "4 4 EP 300208.0 A 2000-01-13 \n", "\n", " appln_filing_year appln_nr_original ipr_type receiving_office \n", "0 9999 None PI \\\n", "1 2000 00103094 PI \n", "2 1992 00107845 PI \n", "3 2000 00202556 PI \n", "4 2000 00300208 PI \n", "\n", " internat_appln_id ... earliest_publn_date earliest_publn_year \n", "0 0 ... 9999-12-31 9999 \\\n", "1 0 ... 2000-09-20 2000 \n", "2 0 ... 2000-08-02 2000 \n", "3 0 ... 2001-01-24 2001 \n", "4 0 ... 2000-07-26 2000 \n", "\n", " earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n", "0 0 N 0 0 \\\n", "1 293253293 Y 8554171 1 \n", "2 301548848 Y 27517085 2 \n", "3 291964096 N 7915918 3 \n", "4 292901055 Y 22889365 4 \n", "\n", " docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n", "0 1 0 0 0 \n", "1 6 79 1 4 \n", "2 8 56 2 6 \n", "3 4 22 2 3 \n", "4 6 27 1 2 \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_201_p.head()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "outdir = \"EU_CH_scope\"\n", "os.makedirs(outdir, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 3min 53s\n", "Wall time: 2min 28s\n" ] } ], "source": [ "%%time\n", "#Application data\n", "tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n", "tls_201_scope = tls_201_p[tls_201_p['appln_id'].isin(id_scope)]\n", "tls_201_scope.compute().to_csv(f\"{outdir}/tls_201_scope.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 1min 12s\n", "Wall time: 24.3 s\n" ] } ], "source": [ "%%time\n", "#Person-appln data\n", "tls_207_p = dd.read_parquet(\"tls_207.parquet\")\n", "tls_207_scope = tls_207_p[((tls_207_p['person_id'].isin(pers_id_scope))&\n", " (tls_207_p['appln_id'].isin(id_scope)))]\n", "tls_207_scope.compute().to_csv(f\"{outdir}/tls_207_scope.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 4min 10s\n", "Wall time: 3min 1s\n" ] } ], "source": [ "%%time\n", "#Person data\n", "tls_206_p = dd.read_parquet(\"tls_206.parquet\")\n", "tls_206_scope = tls_206_p[tls_206_p['person_id'].isin(pers_id_scope)]\n", "tls_206_scope.compute().to_csv(f\"{outdir}/tls_206_scope.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2min 5s\n", "Wall time: 1min 26s\n" ] } ], "source": [ "%%time\n", "#Application title data\n", "tls_202_p = dd.read_csv(\"table_tls202.csv\")\n", "tls_202_scope = tls_202_p[tls_202_p['appln_id'].isin(id_scope)]\n", "tls_202_scope.compute().to_csv(f\"{outdir}/tls_202_scope.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2min 15s\n", "Wall time: 1min 11s\n" ] } ], "source": [ "%%time\n", "#IPC data\n", "tls_224_p = dd.read_csv(\"table_tls224.csv\")\n", "tls_224_p_scope = tls_224_p[tls_224_p['appln_id'].isin(id_scope)]\n", "tls_224_p_scope.compute().to_csv(f\"{outdir}/tls_224_scope.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 1.97 s\n", "Wall time: 2.66 s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idappln_title_lgappln_title
01enMethod and means for using additional cards in...
12enProduction of anti-self antibodies from antibo...
23enScintillation radiation detector
34enWire bonding to copper
45enMethod of manufacturing electrodes for thin fi...
56enSystem for automatically routing calls to call...
67enMULTILAYER TELECOMMUNICATIONS NETWORK
78enEXHAUST GAS PURIFYING CATALYST COMPOUND, CATAL...
89enSPRAYHEAD WITH NOZZLES MADE BY BORING
910enDiphosphines
\n", "
" ], "text/plain": [ " appln_id appln_title_lg appln_title\n", "0 1 en Method and means for using additional cards in...\n", "1 2 en Production of anti-self antibodies from antibo...\n", "2 3 en Scintillation radiation detector\n", "3 4 en Wire bonding to copper\n", "4 5 en Method of manufacturing electrodes for thin fi...\n", "5 6 en System for automatically routing calls to call...\n", "6 7 en MULTILAYER TELECOMMUNICATIONS NETWORK\n", "7 8 en EXHAUST GAS PURIFYING CATALYST COMPOUND, CATAL...\n", "8 9 en SPRAYHEAD WITH NOZZLES MADE BY BORING\n", "9 10 en Diphosphines" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "tls_202 = dd.read_csv(\"table_tls202.csv\", low_memory=False)\n", "tls_202.head(10)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idtech_rel_appln_id
0831308284
115948940218
215951019180
315951022388
415951022511
\n", "
" ], "text/plain": [ " appln_id tech_rel_appln_id\n", "0 8 31308284\n", "1 159 48940218\n", "2 159 51019180\n", "3 159 51022388\n", "4 159 51022511" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_205 = dd.read_csv(\"table_tls205.csv\", low_memory=False)\n", "tls_205.head()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# Countries" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 COMPANY\n", "1 NaN\n", "2 GOV NON-PROFIT\n", "3 INDIVIDUAL\n", "4 UNIVERSITY\n", "5 UNKNOWN\n", "6 GOV NON-PROFIT UNIVERSITY\n", "7 COMPANY GOV NON-PROFIT\n", "8 HOSPITAL\n", "9 COMPANY HOSPITAL\n", "10 COMPANY UNIVERSITY\n", "11 COMPANY GOV NON-PROFIT UNIVERSITY\n", "12 COMPANY INDIVIDUAL\n", "13 COMPANY GOV NON-PROFIT \n", "14 GOV NON-PROFIT HOSPITAL\n", "Name: psn_sector, dtype: object" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_206[\"psn_sector\"].unique().compute()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
person_orig_idperson_idsourcesource_versionname_freeformperson_name_orig_lglast_namefirst_namemiddle_nameaddress_freeform...address_3address_4address_5streetcityzip_codestateperson_ctry_coderesidence_ctry_coderole
011EPREGNaNNokia CorporationNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNFINaN
122EPREGNaNLipponen, MarkkuNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNFINaN
233EPREGNaNLaitinen, TimoNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNFINaN
344EPREGNaNAho, AriNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNFINaN
455EPREGNaNKnuutila, JarnoNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNFINaN
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " person_orig_id person_id source source_version name_freeform \n", "0 1 1 EPREG NaN Nokia Corporation \\\n", "1 2 2 EPREG NaN Lipponen, Markku \n", "2 3 3 EPREG NaN Laitinen, Timo \n", "3 4 4 EPREG NaN Aho, Ari \n", "4 5 5 EPREG NaN Knuutila, Jarno \n", "\n", " person_name_orig_lg last_name first_name middle_name address_freeform \n", "0 NaN NaN NaN NaN NaN \\\n", "1 NaN NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN \n", "\n", " ... address_3 address_4 address_5 street city zip_code state \n", "0 ... NaN NaN NaN NaN NaN NaN \\\n", "1 ... NaN NaN NaN NaN NaN NaN \n", "2 ... NaN NaN NaN NaN NaN NaN \n", "3 ... NaN NaN NaN NaN NaN NaN \n", "4 ... NaN NaN NaN NaN NaN NaN \n", "\n", " person_ctry_code residence_ctry_code role \n", "0 FI NaN \n", "1 FI NaN \n", "2 FI NaN \n", "3 FI NaN \n", "4 FI NaN \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_226 = dd.read_csv(\"table_tls226.csv\", low_memory=False)\n", "tls_226.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "137" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_226.npartitions" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idnat_class_symbol
0379100G2J JGFG JGFG
1448592NOT CLASSIFIED
2448593NOT CLASSIFIED
3448594NOT CLASSIFIED
4448595NOT CLASSIFIED
\n", "
" ], "text/plain": [ " appln_id nat_class_symbol\n", "0 379100 G2J JGFG JGFG\n", "1 448592 NOT CLASSIFIED \n", "2 448593 NOT CLASSIFIED \n", "3 448594 NOT CLASSIFIED \n", "4 448595 NOT CLASSIFIED " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls_210 = dd.read_csv(\"table_tls210.csv\", low_memory=False)\n", "tls_210.head()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "df = tls_210[tls_210.appln_id == 448594] # Select a subsection\n", "result = df#.groupby('id').value.mean() # Reduce to a smaller size\n", "result = result.compute()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
appln_idnat_class_symbol
3448594NOT CLASSIFIED
\n", "
" ], "text/plain": [ " appln_id nat_class_symbol\n", "3 448594 NOT CLASSIFIED " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }