{ "cells": [ { "cell_type": "code", "execution_count": 143, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import pyarrow\n", "from tqdm import tqdm\n", "import os\n", "import shutil" ] }, { "cell_type": "code", "execution_count": 2, "outputs": [ { "data": { "text/plain": "'2.0.0rc1'" }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.__version__" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [ { "ename": "OptionError", "evalue": "'You can only set the value of existing options'", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mOptionError\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mpd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43moptions\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmode\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdtype_backend\u001B[49m \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mpyarrow\u001B[39m\u001B[38;5;124m'\u001B[39m\n", "File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_config\\config.py:226\u001B[0m, in \u001B[0;36mDictWrapper.__setattr__\u001B[1;34m(self, key, val)\u001B[0m\n\u001B[0;32m 224\u001B[0m _set_option(prefix, val)\n\u001B[0;32m 225\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 226\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m OptionError(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mYou can only set the value of existing options\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", "\u001B[1;31mOptionError\u001B[0m: 'You can only set the value of existing options'" ] } ], "source": [ "# pd.options.mode #= 'pyarrow'" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 119, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\n", "D:\\PATSTAT\n" ] } ], "source": [ "import os\n", "print(os.getcwd()) # Prints the current working directory\n", "\n", "workdir_path=r\"D:\\PATSTAT\"\n", "os.chdir(workdir_path)\n", "print(os.getcwd())" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CONCAT: ['tls201_part01.csv', 'tls201_part02.csv', 'tls201_part03.csv'] TO D:\\PATSTAT\\table_tls201.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "1000it [07:12, 2.31it/s]\n", "1000it [07:25, 2.25it/s]\n", "429it [02:55, 2.44it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "deleting\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part01.csv\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part02.csv\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part03.csv\n", "MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls202_part01.csv TO D:\\PATSTAT\\table_tls202.csv\n", "MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls204_part01\\tls204_part01.csv TO D:\\PATSTAT\\table_tls204.csv\n", "MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls205_part01\\tls205_part01.csv TO D:\\PATSTAT\\table_tls205.csv\n", "CONCAT: ['tls206_part01.csv', 'tls206_part02.csv'] TO D:\\PATSTAT\\table_tls206.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "1000it [07:21, 2.27it/s]\n", "715it [05:32, 2.15it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "deleting\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part01\\tls206_part01.csv\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part02\\tls206_part02.csv\n", "MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls207_part01\\tls207_part01.csv TO D:\\PATSTAT\\table_tls207.csv\n", "CONCAT: ['tls209_part01.csv', 'tls209_part02.csv'] TO D:\\PATSTAT\\table_tls209.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "3200it [07:35, 7.02it/s]\n", "3049it [07:15, 7.01it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "deleting\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part01\\tls209_part01.csv\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part02\\tls209_part02.csv\n", "MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls210_part01\\tls210_part01.csv TO D:\\PATSTAT\\table_tls210.csv\n", "CONCAT: ['tls211_part01.csv', 'tls211_part02.csv'] TO D:\\PATSTAT\\table_tls211.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2500it [08:45, 4.76it/s]\n", "369it [01:23, 4.42it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "deleting\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part01\\tls211_part01.csv\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part02\\tls211_part02.csv\n", "CONCAT: ['tls212_part01.csv', 'tls212_part02.csv', 'tls212_part03.csv'] TO D:\\PATSTAT\\table_tls212.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "4000it [12:45, 5.23it/s]\n", "4000it [12:57, 5.14it/s]\n", "1232it [03:54, 5.25it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "deleting\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls212_part01\\tls212_part01.csv\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part02\\tls212_part02.csv\n", "D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part03\\tls212_part03.csv\n", "MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls214_part01\\tls214_part01.csv TO D:\\PATSTAT\\table_tls214.csv\n", "CONCAT: ['tls215_part01.csv', 'tls215_part02.csv', 'tls215_part03.csv'] TO D:\\PATSTAT\\table_tls215.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "8000it [15:10, 8.79it/s]\n", "3524it [06:32, 9.30it/s]" ] } ], "source": [ "file_path_dict=dict()\n", "\n", "# iterate over files in\n", "# that directory\n", "for root, dirs, files in os.walk(workdir_path):\n", " for filename in files:\n", " if filename.endswith(\".csv\") and not filename.startswith(\"table\"):\n", " path= os.path.join(root, filename)\n", " file_path_dict[filename] = path\n", "\n", "complete_file_set = set()\n", "for fname in file_path_dict.keys():\n", " complete_file_set.add(fname.split(\"_\")[0])\n", "complete_file_set = sorted(complete_file_set)\n", "\n", "for complete_file in complete_file_set:\n", " file_list = [file for file in file_path_dict.keys() if complete_file in file]\n", "\n", " outfile_path = os.path.join(workdir_path,\"table_\"+complete_file+\".csv\")\n", " # print(outfile_path,file_list)\n", "\n", " if len(file_list)==1:\n", " file_path = file_path_dict.get(file_list[0])\n", " print(\"MOVE:\",file_path ,\"TO\",outfile_path)\n", " shutil.move(file_path, outfile_path)\n", " else:\n", " print(\"CONCAT:\",file_list ,\"TO\",outfile_path)\n", "\n", "\n", " CHUNK_SIZE = 50000\n", " with_header=True\n", " first_one = True\n", " for csv_file_name in file_list:\n", " csv_file_path = file_path_dict.get(csv_file_name)\n", " chunk_container = pd.read_csv(csv_file_path, chunksize=CHUNK_SIZE,low_memory=False)\n", " for chunk in tqdm(chunk_container):\n", " chunk.to_csv(outfile_path, mode=\"a\", index=False, header=with_header)\n", " with_header=False\n", " first_one = False\n", "\n", " print(\"deleting\")\n", " for csv_file_name in file_list:\n", " csv_file_path = file_path_dict.get(csv_file_name)\n", " print(csv_file_path)\n", " os.remove(csv_file_path)" ], "metadata": { "collapsed": false, "pycharm": { "is_executing": true } } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }