Compare commits

..

No commits in common. 'main' and 'utku_keyword_suggestion' have entirely different histories.

26
.gitignore vendored

@ -1,26 +1,2 @@
/PATSTAT/EU_CH_scope/cpc_defs.csv
/misc_code/ /misc_code/
/PATSTAT/appln_data.xlsx
/PATSTAT/person_data.xlsx
/PPT/~$ZSI ReConnect China Report.pptx
/WOS/wos_extract/wos_downloads/
/WOS/wos_extract/wos_downloads/
/WOS/kw_token_ranked_bibliometrics_okset.xlsx
/PATSTAT/first_round/tls_206_scope.csv
/PATSTAT/first_round/first-filings-with-persons-raw.csv
/PATSTAT/raw_files_csv/first-filings-with-IPC-raw.csv
/PATSTAT/first_round/first-filings-raw.csv
/PATSTAT/raw_files_csv/01_patent_subset.csv
/PATSTAT/raw_files_csv/04_persons_2011_2022_inv_pairs.csv
/PATSTAT/raw_files_csv/03_persons_2011_2022_collected.csv
/PATSTAT/raw_files_csv/02_persons_2011_2022.csv
/PATSTAT/first_round/2IPCGreenInventoryList_2022WIPO.xlsx
/PATSTAT/DS/ds1.png
/PATSTAT/ds1.xlsx
/PATSTAT/DS/ds2.png
/PATSTAT/ds2.xlsx
/PATSTAT/ds3.xlsx
/PATSTAT/DS/ds3.png
/PATSTAT/first_round/IPC_green_codes.csv
/PATSTAT/first_round/IPC_green_codes.xlsx
/ZSI_previous_analyses/ReConnect China_preliminary_report.docx
/PATSTAT/first_round/tls_206_scope_v2.csv

@ -1,12 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
<data-source source="LOCAL" name="MOME_Adam@dataw16.mome.local" read-only="true" uuid="389ec0ff-653b-4714-abd4-4f8bc9c6a8f3">
<driver-ref>sqlserver.ms</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>com.microsoft.sqlserver.jdbc.SQLServerDriver</jdbc-driver>
<jdbc-url>jdbc:sqlserver://dataw16.mome.local:14333;database=MOME_Adam</jdbc-url>
<working-dir>$ProjectFileDir$</working-dir>
</data-source>
</component>
</project>

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:e6617cfd30c21c45e7c484067afb13c4b70d923242c718e67f3f4cdcf09b88a9
3 size 159305338

BIN
PATSTAT/CPC_data/cpc_defs.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
3 size 159308737

BIN
PATSTAT/CPC_data/scope_cpc_defs.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:76479394378a76774399904f0aa8104a2fdf0d2ec39d22a928a0c07eb80e6e0c
3 size 209293

BIN
PATSTAT/EU_CH_scope/table_tls801.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
3 size 13988

BIN
PATSTAT/EU_CH_scope/tls_201_scope.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
3 size 3267628

BIN
PATSTAT/EU_CH_scope/tls_202_scope.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
3 size 1623737

BIN
PATSTAT/EU_CH_scope/tls_206_scope.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
3 size 6809037

BIN
PATSTAT/EU_CH_scope/tls_207_scope.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
3 size 2002469

BIN
PATSTAT/EU_CH_scope/tls_224_scope.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
3 size 4334296

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:4869b7a8b2f6ca8df1ac495eb3ff2f572cc5c41339076e7bdb40262b74eea7f7
3 size 13988

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:3264fda505d2a4728fbc4ad7adea0203c9b7d14538b16d5c6493a4002bfc56e5
3 size 14231

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:bf5ccd7434bb1049e1ee6899203730979ca728a177f4f99daf9bb0f4b6d5dd20
3 size 3267628

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:9c0a38a6ca3e3cbc33cf41be88a9c1b1f644497008d970e506b59cfa09223a77
3 size 1623737

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:ebb998b094a1665a9f648443e8b2e0fdddd26a4e00b7920916edc281e5b05c45
3 size 6809037

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:2ab4c724251ab75ffe34101d2201dcabe187238d8638f23731fc9b29b4add8ff
3 size 2002469

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:9ffe1e6be5b893477be98d01c168ee97f12fe1ce80bdc32f4673f4dc84a273fd
3 size 4334296

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:3264fda505d2a4728fbc4ad7adea0203c9b7d14538b16d5c6493a4002bfc56e5
3 size 14231

BIN
PATSTAT/WESTERN_CH_scope/cpc_defs.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:e6617cfd30c21c45e7c484067afb13c4b70d923242c718e67f3f4cdcf09b88a9
3 size 159305338

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:cfab497f8744acc33258f7617ae7b8a276da7c29f848f62798450e503f4ed95f
3 size 14146

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:ae9939bc1427d6b394c4e8527b7befc5da0970bf4306daea09a6fe7de232cb87
3 size 31252973

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:2357701683dfeba8f8ef22f77405ea882f2e47827304fd3090cb1ecdedfb2e01
3 size 15421027

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:370c4a514f7023d15319c6e34273bd81a1136d99d39f54bd5f18936c7f966cf7
3 size 55399448

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8924d85e95fe5041394179e0edf6bc2b8ecf840e0c0fbbe80625856b2e50bfb6
3 size 24489484

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:46b0dd86dd4b7064405d0b165254ea09959cda8537a1fa2e78453f2cbd4ec707
3 size 44129929

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:34ee5594aef056982a92766572fc7a54f62ccb06f632d0f4e6f34c567bafefc7
3 size 215

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8a7715595b37193690d811cc0e6a644bc02baa5f1c66de9946b5cf970d3f2445
3 size 263

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:5e129abbc9f43bbde7ac41ecb9a8267b3ceebe70954db47eef13995a80a9aae6
3 size 4918

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:a53c696ee6a84b32b75264db6ff0985ac434791793f96287e3bf3af9246225f3
3 size 5110

@ -0,0 +1,402 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 143,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import pyarrow\n",
"from tqdm import tqdm\n",
"import os\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": "'2.0.0rc1'"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"ename": "OptionError",
"evalue": "'You can only set the value of existing options'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mOptionError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mpd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43moptions\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmode\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdtype_backend\u001B[49m \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mpyarrow\u001B[39m\u001B[38;5;124m'\u001B[39m\n",
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_config\\config.py:226\u001B[0m, in \u001B[0;36mDictWrapper.__setattr__\u001B[1;34m(self, key, val)\u001B[0m\n\u001B[0;32m 224\u001B[0m _set_option(prefix, val)\n\u001B[0;32m 225\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 226\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m OptionError(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mYou can only set the value of existing options\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
"\u001B[1;31mOptionError\u001B[0m: 'You can only set the value of existing options'"
]
}
],
"source": [
"# pd.options.mode #= 'pyarrow'"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 119,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\n",
"D:\\PATSTAT\n"
]
}
],
"source": [
"import os\n",
"print(os.getcwd()) # Prints the current working directory\n",
"\n",
"workdir_path=r\"D:\\PATSTAT\"\n",
"os.chdir(workdir_path)\n",
"print(os.getcwd())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 148,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CONCAT: ['tls201_part01.csv', 'tls201_part02.csv', 'tls201_part03.csv'] TO D:\\PATSTAT\\table_tls201.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1000it [07:12, 2.31it/s]\n",
"1000it [07:25, 2.25it/s]\n",
"429it [02:55, 2.44it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part02.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part03.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls202_part01.csv TO D:\\PATSTAT\\table_tls202.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls204_part01\\tls204_part01.csv TO D:\\PATSTAT\\table_tls204.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls205_part01\\tls205_part01.csv TO D:\\PATSTAT\\table_tls205.csv\n",
"CONCAT: ['tls206_part01.csv', 'tls206_part02.csv'] TO D:\\PATSTAT\\table_tls206.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1000it [07:21, 2.27it/s]\n",
"715it [05:32, 2.15it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part01\\tls206_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part02\\tls206_part02.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls207_part01\\tls207_part01.csv TO D:\\PATSTAT\\table_tls207.csv\n",
"CONCAT: ['tls209_part01.csv', 'tls209_part02.csv'] TO D:\\PATSTAT\\table_tls209.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"3200it [07:35, 7.02it/s]\n",
"3049it [07:15, 7.01it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part01\\tls209_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part02\\tls209_part02.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls210_part01\\tls210_part01.csv TO D:\\PATSTAT\\table_tls210.csv\n",
"CONCAT: ['tls211_part01.csv', 'tls211_part02.csv'] TO D:\\PATSTAT\\table_tls211.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2500it [08:45, 4.76it/s]\n",
"369it [01:23, 4.42it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part01\\tls211_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part02\\tls211_part02.csv\n",
"CONCAT: ['tls212_part01.csv', 'tls212_part02.csv', 'tls212_part03.csv'] TO D:\\PATSTAT\\table_tls212.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"4000it [12:45, 5.23it/s]\n",
"4000it [12:57, 5.14it/s]\n",
"1232it [03:54, 5.25it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls212_part01\\tls212_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part02\\tls212_part02.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part03\\tls212_part03.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls214_part01\\tls214_part01.csv TO D:\\PATSTAT\\table_tls214.csv\n",
"CONCAT: ['tls215_part01.csv', 'tls215_part02.csv', 'tls215_part03.csv'] TO D:\\PATSTAT\\table_tls215.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"8000it [15:10, 8.79it/s]\n",
"8000it [14:20, 9.30it/s]\n",
"1294it [02:10, 9.95it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls215_part01\\tls215_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls215_part02\\tls215_part02.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls215_part03\\tls215_part03.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls216_part01\\tls216_part01.csv TO D:\\PATSTAT\\table_tls216.csv\n",
"CONCAT: ['tls222_part01.csv', 'tls222_part02.csv'] TO D:\\PATSTAT\\table_tls222.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"5000it [09:11, 9.06it/s]\n",
"2402it [04:24, 9.07it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls222_part01\\tls222_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls222_part02\\tls222_part02.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls223_part01\\tls223_part01.csv TO D:\\PATSTAT\\table_tls223.csv\n",
"CONCAT: ['tls224_part01.csv', 'tls224_part02.csv', 'tls224_part03.csv'] TO D:\\PATSTAT\\table_tls224.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"3000it [04:43, 10.60it/s]\n",
"3000it [04:44, 10.53it/s]\n",
"1131it [01:48, 10.38it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls224_part01\\tls224_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls224_part02\\tls224_part02.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls224_part03\\tls224_part03.csv\n",
"CONCAT: ['tls225_part01.csv', 'tls225_part02.csv'] TO D:\\PATSTAT\\table_tls225.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"3000it [09:19, 5.37it/s]\n",
"529it [01:39, 5.32it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls225_part01\\tls225_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls225_part02\\tls225_part02.csv\n",
"CONCAT: ['tls226_part01.csv', 'tls226_part02.csv'] TO D:\\PATSTAT\\table_tls226.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1500it [11:54, 2.10it/s]\n",
"581it [04:50, 2.00it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls226_part01\\tls226_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls226_part02\\tls226_part02.csv\n",
"CONCAT: ['tls227_part01.csv', 'tls227_part02.csv'] TO D:\\PATSTAT\\table_tls227.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"8000it [11:59, 11.12it/s]\n",
"434it [00:39, 11.09it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls227_part01\\tls227_part01.csv\n",
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls227_part02\\tls227_part02.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls228_part01\\tls228_part01.csv TO D:\\PATSTAT\\table_tls228.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls229_part01\\tls229_part01.csv TO D:\\PATSTAT\\table_tls229.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls230_part01\\tls230_part01.csv TO D:\\PATSTAT\\table_tls230.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls801_part01\\tls801_part01.csv TO D:\\PATSTAT\\table_tls801.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls803_part01\\tls803_part01.csv TO D:\\PATSTAT\\table_tls803.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls901_part01\\tls901_part01.csv TO D:\\PATSTAT\\table_tls901.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls902_part01\\tls902_part01.csv TO D:\\PATSTAT\\table_tls902.csv\n",
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls904_part01\\tls904_part01.csv TO D:\\PATSTAT\\table_tls904.csv\n"
]
}
],
"source": [
"file_path_dict=dict()\n",
"\n",
"# iterate over files in\n",
"# that directory\n",
"for root, dirs, files in os.walk(workdir_path):\n",
" for filename in files:\n",
" if filename.endswith(\".csv\") and not filename.startswith(\"table\"):\n",
" path= os.path.join(root, filename)\n",
" file_path_dict[filename] = path\n",
"\n",
"complete_file_set = set()\n",
"for fname in file_path_dict.keys():\n",
" complete_file_set.add(fname.split(\"_\")[0])\n",
"complete_file_set = sorted(complete_file_set)\n",
"\n",
"for complete_file in complete_file_set:\n",
" file_list = [file for file in file_path_dict.keys() if complete_file in file]\n",
"\n",
" outfile_path = os.path.join(workdir_path,\"table_\"+complete_file+\".csv\")\n",
" # print(outfile_path,file_list)\n",
"\n",
" if len(file_list)==1:\n",
" file_path = file_path_dict.get(file_list[0])\n",
" print(\"MOVE:\",file_path ,\"TO\",outfile_path)\n",
" shutil.move(file_path, outfile_path)\n",
" else:\n",
" print(\"CONCAT:\",file_list ,\"TO\",outfile_path)\n",
"\n",
"\n",
" CHUNK_SIZE = 50000\n",
" with_header=True\n",
" first_one = True\n",
" for csv_file_name in file_list:\n",
" csv_file_path = file_path_dict.get(csv_file_name)\n",
" chunk_container = pd.read_csv(csv_file_path, chunksize=CHUNK_SIZE,low_memory=False)\n",
" for chunk in tqdm(chunk_container):\n",
" chunk.to_csv(outfile_path, mode=\"a\", index=False, header=with_header)\n",
" with_header=False\n",
" first_one = False\n",
"\n",
" print(\"deleting\")\n",
" for csv_file_name in file_list:\n",
" csv_file_path = file_path_dict.get(csv_file_name)\n",
" print(csv_file_path)\n",
" os.remove(csv_file_path)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

@ -1,375 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import pyarrow\n",
"from tqdm import tqdm\n",
"import os\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": "'2.0.0rc1'"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n",
"D:\\PATSTAT\n"
]
}
],
"source": [
"import os\n",
"print(os.getcwd()) # Prints the current working directory\n",
"\n",
"workdir_path=r\"D:\\PATSTAT\"\n",
"os.chdir(workdir_path)\n",
"print(os.getcwd())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CONCAT: ['tls201_part01.csv', 'tls201_part02.csv', 'tls201_part03.csv'] TO D:\\PATSTAT\\table_tls201.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1000it [08:05, 2.06it/s]\n",
"1000it [08:23, 1.99it/s]\n",
"506it [04:04, 2.07it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls201_part01\\tls201_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls201_part02\\tls201_part02.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls201_part03\\tls201_part03.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls202_part01\\tls202_part01.csv TO D:\\PATSTAT\\table_tls202.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_05\\tls204_part01\\tls204_part01.csv TO D:\\PATSTAT\\table_tls204.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_05\\tls205_part01\\tls205_part01.csv TO D:\\PATSTAT\\table_tls205.csv\n",
"CONCAT: ['tls206_part01.csv', 'tls206_part02.csv'] TO D:\\PATSTAT\\table_tls206.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1000it [08:15, 2.02it/s]\n",
"777it [06:38, 1.95it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_05\\tls206_part01\\tls206_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls206_part02\\tls206_part02.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls207_part01\\tls207_part01.csv TO D:\\PATSTAT\\table_tls207.csv\n",
"CONCAT: ['tls209_part01.csv', 'tls209_part02.csv', 'tls209_part03.csv'] TO D:\\PATSTAT\\table_tls209.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"3200it [07:58, 6.69it/s]\n",
"3200it [07:54, 6.75it/s]\n",
"217it [00:31, 6.96it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls209_part01\\tls209_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls209_part02\\tls209_part02.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls209_part03\\tls209_part03.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls210_part01\\tls210_part01.csv TO D:\\PATSTAT\\table_tls210.csv\n",
"CONCAT: ['tls211_part01.csv', 'tls211_part02.csv'] TO D:\\PATSTAT\\table_tls211.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2500it [08:18, 5.01it/s]\n",
"463it [01:43, 4.46it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls211_part01\\tls211_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls211_part02\\tls211_part02.csv\n",
"CONCAT: ['tls212_part01.csv', 'tls212_part02.csv', 'tls212_part03.csv'] TO D:\\PATSTAT\\table_tls212.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"4000it [10:46, 6.19it/s]\n",
"4000it [10:54, 6.11it/s]\n",
"1764it [04:47, 6.13it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls212_part01\\tls212_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls212_part02\\tls212_part02.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls212_part03\\tls212_part03.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls214_part01\\tls214_part01.csv TO D:\\PATSTAT\\table_tls214.csv\n",
"CONCAT: ['tls215_part01.csv', 'tls215_part02.csv', 'tls215_part03.csv'] TO D:\\PATSTAT\\table_tls215.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"8000it [12:35, 10.60it/s]\n",
"8000it [12:24, 10.74it/s]\n",
"3101it [04:50, 10.66it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls215_part01\\tls215_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls215_part02\\tls215_part02.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls215_part03\\tls215_part03.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls216_part01\\tls216_part01.csv TO D:\\PATSTAT\\table_tls216.csv\n",
"CONCAT: ['tls222_part01.csv', 'tls222_part02.csv'] TO D:\\PATSTAT\\table_tls222.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"5000it [08:34, 9.71it/s]\n",
"2489it [04:16, 9.70it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls222_part01\\tls222_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls222_part02\\tls222_part02.csv\n",
"CONCAT: ['tls224_part01.csv', 'tls224_part02.csv', 'tls224_part03.csv'] TO D:\\PATSTAT\\table_tls224.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"3000it [04:24, 11.35it/s]\n",
"3000it [04:27, 11.20it/s]\n",
"1442it [02:07, 11.29it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls224_part01\\tls224_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls224_part02\\tls224_part02.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls224_part03\\tls224_part03.csv\n",
"CONCAT: ['tls225_part01.csv', 'tls225_part02.csv'] TO D:\\PATSTAT\\table_tls225.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"3000it [08:29, 5.89it/s]\n",
"718it [02:02, 5.88it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls225_part01\\tls225_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls225_part02\\tls225_part02.csv\n",
"CONCAT: ['tls226_part01.csv', 'tls226_part02.csv'] TO D:\\PATSTAT\\table_tls226.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1500it [10:40, 2.34it/s]\n",
"664it [04:56, 2.24it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls226_part01\\tls226_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls226_part02\\tls226_part02.csv\n",
"CONCAT: ['tls227_part01.csv', 'tls227_part02.csv'] TO D:\\PATSTAT\\table_tls227.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"8000it [12:42, 10.49it/s]\n",
"862it [01:22, 10.39it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"deleting\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls227_part01\\tls227_part01.csv\n",
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls227_part02\\tls227_part02.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls228_part01\\tls228_part01.csv TO D:\\PATSTAT\\table_tls228.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls229_part01\\tls229_part01.csv TO D:\\PATSTAT\\table_tls229.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls230_part01\\tls230_part01.csv TO D:\\PATSTAT\\table_tls230.csv\n",
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_11\\tls801_part01\\tls801_part01.csv TO D:\\PATSTAT\\table_tls801.csv\n"
]
}
],
"source": [
"file_path_dict=dict()\n",
"\n",
"# iterate over files in\n",
"# that directory\n",
"for root, dirs, files in os.walk(workdir_path):\n",
" for filename in files:\n",
" if filename.endswith(\".csv\") and not filename.startswith(\"table\"):\n",
" path= os.path.join(root, filename)\n",
" file_path_dict[filename] = path\n",
"\n",
"complete_file_set = set()\n",
"for fname in file_path_dict.keys():\n",
" complete_file_set.add(fname.split(\"_\")[0])\n",
"complete_file_set = sorted(complete_file_set)\n",
"\n",
"for complete_file in complete_file_set:\n",
" file_list = [file for file in file_path_dict.keys() if complete_file in file]\n",
"\n",
" outfile_path = os.path.join(workdir_path,\"table_\"+complete_file+\".csv\")\n",
" # print(outfile_path,file_list)\n",
"\n",
" if len(file_list)==1:\n",
" file_path = file_path_dict.get(file_list[0])\n",
" print(\"MOVE:\",file_path ,\"TO\",outfile_path)\n",
" shutil.move(file_path, outfile_path)\n",
" else:\n",
" print(\"CONCAT:\",file_list ,\"TO\",outfile_path)\n",
"\n",
"\n",
" CHUNK_SIZE = 50000\n",
" with_header = True\n",
" first_one = True\n",
" for csv_file_name in file_list:\n",
" csv_file_path = file_path_dict.get(csv_file_name)\n",
" chunk_container = pd.read_csv(csv_file_path, chunksize=CHUNK_SIZE,low_memory=False)\n",
" for chunk in tqdm(chunk_container):\n",
" chunk.to_csv(outfile_path, mode=\"a\", index=False, header=with_header)\n",
" with_header=False\n",
" first_one = False\n",
"\n",
" print(\"deleting\")\n",
" for csv_file_name in file_list:\n",
" csv_file_path = file_path_dict.get(csv_file_name)\n",
" print(csv_file_path)\n",
" os.remove(csv_file_path)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 988 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 718 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 742 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 653 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 755 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 536 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 756 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 742 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 764 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 751 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 900 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1010 KiB

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 800 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 808 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 762 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 785 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 968 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 747 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 918 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 683 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 832 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 858 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 598 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 629 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 703 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 716 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 841 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 754 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 726 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 782 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 681 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 673 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 746 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 705 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 807 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 752 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 729 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 761 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 623 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 598 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 626 KiB

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save