init commit
commit
e060613834
@ -0,0 +1,8 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.9 (MOME_BIGDATA)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="JupyterPackageInspection" enabled="false" level="WARNING" enabled_by_default="false" />
|
||||
</profile>
|
||||
</component>
|
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (MOME_BIGDATA)" project-jdk-type="Python SDK" />
|
||||
</project>
|
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/ZSI_analytics.iml" filepath="$PROJECT_DIR$/.idea/ZSI_analytics.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,292 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 143,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import pyarrow\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import os\n",
|
||||
"import shutil"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "'2.0.0rc1'"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pd.__version__"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "OptionError",
|
||||
"evalue": "'You can only set the value of existing options'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
|
||||
"\u001B[1;31mOptionError\u001B[0m Traceback (most recent call last)",
|
||||
"Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mpd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43moptions\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmode\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdtype_backend\u001B[49m \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mpyarrow\u001B[39m\u001B[38;5;124m'\u001B[39m\n",
|
||||
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_config\\config.py:226\u001B[0m, in \u001B[0;36mDictWrapper.__setattr__\u001B[1;34m(self, key, val)\u001B[0m\n\u001B[0;32m 224\u001B[0m _set_option(prefix, val)\n\u001B[0;32m 225\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 226\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m OptionError(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mYou can only set the value of existing options\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
|
||||
"\u001B[1;31mOptionError\u001B[0m: 'You can only set the value of existing options'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# pd.options.mode #= 'pyarrow'"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 119,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\n",
|
||||
"D:\\PATSTAT\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"print(os.getcwd()) # Prints the current working directory\n",
|
||||
"\n",
|
||||
"workdir_path=r\"D:\\PATSTAT\"\n",
|
||||
"os.chdir(workdir_path)\n",
|
||||
"print(os.getcwd())"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CONCAT: ['tls201_part01.csv', 'tls201_part02.csv', 'tls201_part03.csv'] TO D:\\PATSTAT\\table_tls201.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1000it [07:12, 2.31it/s]\n",
|
||||
"1000it [07:25, 2.25it/s]\n",
|
||||
"429it [02:55, 2.44it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part02.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls202_part01.csv TO D:\\PATSTAT\\table_tls202.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls204_part01\\tls204_part01.csv TO D:\\PATSTAT\\table_tls204.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls205_part01\\tls205_part01.csv TO D:\\PATSTAT\\table_tls205.csv\n",
|
||||
"CONCAT: ['tls206_part01.csv', 'tls206_part02.csv'] TO D:\\PATSTAT\\table_tls206.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1000it [07:21, 2.27it/s]\n",
|
||||
"715it [05:32, 2.15it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part01\\tls206_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part02\\tls206_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls207_part01\\tls207_part01.csv TO D:\\PATSTAT\\table_tls207.csv\n",
|
||||
"CONCAT: ['tls209_part01.csv', 'tls209_part02.csv'] TO D:\\PATSTAT\\table_tls209.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3200it [07:35, 7.02it/s]\n",
|
||||
"3049it [07:15, 7.01it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part01\\tls209_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part02\\tls209_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls210_part01\\tls210_part01.csv TO D:\\PATSTAT\\table_tls210.csv\n",
|
||||
"CONCAT: ['tls211_part01.csv', 'tls211_part02.csv'] TO D:\\PATSTAT\\table_tls211.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2500it [08:45, 4.76it/s]\n",
|
||||
"369it [01:23, 4.42it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part01\\tls211_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part02\\tls211_part02.csv\n",
|
||||
"CONCAT: ['tls212_part01.csv', 'tls212_part02.csv', 'tls212_part03.csv'] TO D:\\PATSTAT\\table_tls212.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"4000it [12:45, 5.23it/s]\n",
|
||||
"4000it [12:57, 5.14it/s]\n",
|
||||
"1232it [03:54, 5.25it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls212_part01\\tls212_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part02\\tls212_part02.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part03\\tls212_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls214_part01\\tls214_part01.csv TO D:\\PATSTAT\\table_tls214.csv\n",
|
||||
"CONCAT: ['tls215_part01.csv', 'tls215_part02.csv', 'tls215_part03.csv'] TO D:\\PATSTAT\\table_tls215.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8000it [15:10, 8.79it/s]\n",
|
||||
"3524it [06:32, 9.30it/s]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file_path_dict=dict()\n",
|
||||
"\n",
|
||||
"# iterate over files in\n",
|
||||
"# that directory\n",
|
||||
"for root, dirs, files in os.walk(workdir_path):\n",
|
||||
" for filename in files:\n",
|
||||
" if filename.endswith(\".csv\") and not filename.startswith(\"table\"):\n",
|
||||
" path= os.path.join(root, filename)\n",
|
||||
" file_path_dict[filename] = path\n",
|
||||
"\n",
|
||||
"complete_file_set = set()\n",
|
||||
"for fname in file_path_dict.keys():\n",
|
||||
" complete_file_set.add(fname.split(\"_\")[0])\n",
|
||||
"complete_file_set = sorted(complete_file_set)\n",
|
||||
"\n",
|
||||
"for complete_file in complete_file_set:\n",
|
||||
" file_list = [file for file in file_path_dict.keys() if complete_file in file]\n",
|
||||
"\n",
|
||||
" outfile_path = os.path.join(workdir_path,\"table_\"+complete_file+\".csv\")\n",
|
||||
" # print(outfile_path,file_list)\n",
|
||||
"\n",
|
||||
" if len(file_list)==1:\n",
|
||||
" file_path = file_path_dict.get(file_list[0])\n",
|
||||
" print(\"MOVE:\",file_path ,\"TO\",outfile_path)\n",
|
||||
" shutil.move(file_path, outfile_path)\n",
|
||||
" else:\n",
|
||||
" print(\"CONCAT:\",file_list ,\"TO\",outfile_path)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" CHUNK_SIZE = 50000\n",
|
||||
" with_header=True\n",
|
||||
" first_one = True\n",
|
||||
" for csv_file_name in file_list:\n",
|
||||
" csv_file_path = file_path_dict.get(csv_file_name)\n",
|
||||
" chunk_container = pd.read_csv(csv_file_path, chunksize=CHUNK_SIZE,low_memory=False)\n",
|
||||
" for chunk in tqdm(chunk_container):\n",
|
||||
" chunk.to_csv(outfile_path, mode=\"a\", index=False, header=with_header)\n",
|
||||
" with_header=False\n",
|
||||
" first_one = False\n",
|
||||
"\n",
|
||||
" print(\"deleting\")\n",
|
||||
" for csv_file_name in file_list:\n",
|
||||
" csv_file_path = file_path_dict.get(csv_file_name)\n",
|
||||
" print(csv_file_path)\n",
|
||||
" os.remove(csv_file_path)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"is_executing": true
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -0,0 +1,16 @@
|
||||
# This is a sample Python script.
|
||||
|
||||
# Press Shift+F10 to execute it or replace it with your code.
|
||||
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
|
||||
|
||||
|
||||
def print_hi(name):
|
||||
# Use a breakpoint in the code line below to debug your script.
|
||||
print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint.
|
||||
|
||||
|
||||
# Press the green button in the gutter to run the script.
|
||||
if __name__ == '__main__':
|
||||
print_hi('PyCharm')
|
||||
|
||||
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
|
Loading…
Reference in New Issue