Compare commits
No commits in common. 'main' and 'utku_keyword_suggestion' have entirely different histories.
main
...
utku_keywo
@ -1,12 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
|
||||
<data-source source="LOCAL" name="MOME_Adam@dataw16.mome.local" read-only="true" uuid="389ec0ff-653b-4714-abd4-4f8bc9c6a8f3">
|
||||
<driver-ref>sqlserver.ms</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>com.microsoft.sqlserver.jdbc.SQLServerDriver</jdbc-driver>
|
||||
<jdbc-url>jdbc:sqlserver://dataw16.mome.local:14333;database=MOME_Adam</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -0,0 +1,402 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 143,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import pyarrow\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import os\n",
|
||||
"import shutil"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "'2.0.0rc1'"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pd.__version__"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "OptionError",
|
||||
"evalue": "'You can only set the value of existing options'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
|
||||
"\u001B[1;31mOptionError\u001B[0m Traceback (most recent call last)",
|
||||
"Cell \u001B[1;32mIn[12], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mpd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43moptions\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmode\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdtype_backend\u001B[49m \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mpyarrow\u001B[39m\u001B[38;5;124m'\u001B[39m\n",
|
||||
"File \u001B[1;32m~\\.conda\\envs\\MOME_BIGDATA\\lib\\site-packages\\pandas\\_config\\config.py:226\u001B[0m, in \u001B[0;36mDictWrapper.__setattr__\u001B[1;34m(self, key, val)\u001B[0m\n\u001B[0;32m 224\u001B[0m _set_option(prefix, val)\n\u001B[0;32m 225\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 226\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m OptionError(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mYou can only set the value of existing options\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
|
||||
"\u001B[1;31mOptionError\u001B[0m: 'You can only set the value of existing options'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# pd.options.mode #= 'pyarrow'"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 119,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\n",
|
||||
"D:\\PATSTAT\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"print(os.getcwd()) # Prints the current working directory\n",
|
||||
"\n",
|
||||
"workdir_path=r\"D:\\PATSTAT\"\n",
|
||||
"os.chdir(workdir_path)\n",
|
||||
"print(os.getcwd())"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 148,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CONCAT: ['tls201_part01.csv', 'tls201_part02.csv', 'tls201_part03.csv'] TO D:\\PATSTAT\\table_tls201.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1000it [07:12, 2.31it/s]\n",
|
||||
"1000it [07:25, 2.25it/s]\n",
|
||||
"429it [02:55, 2.44it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part02.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls201_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_01\\tls202_part01.csv TO D:\\PATSTAT\\table_tls202.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls204_part01\\tls204_part01.csv TO D:\\PATSTAT\\table_tls204.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls205_part01\\tls205_part01.csv TO D:\\PATSTAT\\table_tls205.csv\n",
|
||||
"CONCAT: ['tls206_part01.csv', 'tls206_part02.csv'] TO D:\\PATSTAT\\table_tls206.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1000it [07:21, 2.27it/s]\n",
|
||||
"715it [05:32, 2.15it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part01\\tls206_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls206_part02\\tls206_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_05\\tls207_part01\\tls207_part01.csv TO D:\\PATSTAT\\table_tls207.csv\n",
|
||||
"CONCAT: ['tls209_part01.csv', 'tls209_part02.csv'] TO D:\\PATSTAT\\table_tls209.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3200it [07:35, 7.02it/s]\n",
|
||||
"3049it [07:15, 7.01it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part01\\tls209_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls209_part02\\tls209_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls210_part01\\tls210_part01.csv TO D:\\PATSTAT\\table_tls210.csv\n",
|
||||
"CONCAT: ['tls211_part01.csv', 'tls211_part02.csv'] TO D:\\PATSTAT\\table_tls211.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2500it [08:45, 4.76it/s]\n",
|
||||
"369it [01:23, 4.42it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part01\\tls211_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls211_part02\\tls211_part02.csv\n",
|
||||
"CONCAT: ['tls212_part01.csv', 'tls212_part02.csv', 'tls212_part03.csv'] TO D:\\PATSTAT\\table_tls212.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"4000it [12:45, 5.23it/s]\n",
|
||||
"4000it [12:57, 5.14it/s]\n",
|
||||
"1232it [03:54, 5.25it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_06\\tls212_part01\\tls212_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part02\\tls212_part02.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls212_part03\\tls212_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_07\\tls214_part01\\tls214_part01.csv TO D:\\PATSTAT\\table_tls214.csv\n",
|
||||
"CONCAT: ['tls215_part01.csv', 'tls215_part02.csv', 'tls215_part03.csv'] TO D:\\PATSTAT\\table_tls215.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8000it [15:10, 8.79it/s]\n",
|
||||
"8000it [14:20, 9.30it/s]\n",
|
||||
"1294it [02:10, 9.95it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls215_part01\\tls215_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls215_part02\\tls215_part02.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls215_part03\\tls215_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls216_part01\\tls216_part01.csv TO D:\\PATSTAT\\table_tls216.csv\n",
|
||||
"CONCAT: ['tls222_part01.csv', 'tls222_part02.csv'] TO D:\\PATSTAT\\table_tls222.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5000it [09:11, 9.06it/s]\n",
|
||||
"2402it [04:24, 9.07it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls222_part01\\tls222_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls222_part02\\tls222_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls223_part01\\tls223_part01.csv TO D:\\PATSTAT\\table_tls223.csv\n",
|
||||
"CONCAT: ['tls224_part01.csv', 'tls224_part02.csv', 'tls224_part03.csv'] TO D:\\PATSTAT\\table_tls224.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3000it [04:43, 10.60it/s]\n",
|
||||
"3000it [04:44, 10.53it/s]\n",
|
||||
"1131it [01:48, 10.38it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls224_part01\\tls224_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls224_part02\\tls224_part02.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls224_part03\\tls224_part03.csv\n",
|
||||
"CONCAT: ['tls225_part01.csv', 'tls225_part02.csv'] TO D:\\PATSTAT\\table_tls225.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3000it [09:19, 5.37it/s]\n",
|
||||
"529it [01:39, 5.32it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls225_part01\\tls225_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_08\\tls225_part02\\tls225_part02.csv\n",
|
||||
"CONCAT: ['tls226_part01.csv', 'tls226_part02.csv'] TO D:\\PATSTAT\\table_tls226.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1500it [11:54, 2.10it/s]\n",
|
||||
"581it [04:50, 2.00it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls226_part01\\tls226_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls226_part02\\tls226_part02.csv\n",
|
||||
"CONCAT: ['tls227_part01.csv', 'tls227_part02.csv'] TO D:\\PATSTAT\\table_tls227.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8000it [11:59, 11.12it/s]\n",
|
||||
"434it [00:39, 11.09it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls227_part01\\tls227_part01.csv\n",
|
||||
"D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls227_part02\\tls227_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_09\\tls228_part01\\tls228_part01.csv TO D:\\PATSTAT\\table_tls228.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls229_part01\\tls229_part01.csv TO D:\\PATSTAT\\table_tls229.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls230_part01\\tls230_part01.csv TO D:\\PATSTAT\\table_tls230.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls801_part01\\tls801_part01.csv TO D:\\PATSTAT\\table_tls801.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls803_part01\\tls803_part01.csv TO D:\\PATSTAT\\table_tls803.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls901_part01\\tls901_part01.csv TO D:\\PATSTAT\\table_tls901.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls902_part01\\tls902_part01.csv TO D:\\PATSTAT\\table_tls902.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\data_PATSTAT_Global_2022_Autumn_10\\tls904_part01\\tls904_part01.csv TO D:\\PATSTAT\\table_tls904.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file_path_dict=dict()\n",
|
||||
"\n",
|
||||
"# iterate over files in\n",
|
||||
"# that directory\n",
|
||||
"for root, dirs, files in os.walk(workdir_path):\n",
|
||||
" for filename in files:\n",
|
||||
" if filename.endswith(\".csv\") and not filename.startswith(\"table\"):\n",
|
||||
" path= os.path.join(root, filename)\n",
|
||||
" file_path_dict[filename] = path\n",
|
||||
"\n",
|
||||
"complete_file_set = set()\n",
|
||||
"for fname in file_path_dict.keys():\n",
|
||||
" complete_file_set.add(fname.split(\"_\")[0])\n",
|
||||
"complete_file_set = sorted(complete_file_set)\n",
|
||||
"\n",
|
||||
"for complete_file in complete_file_set:\n",
|
||||
" file_list = [file for file in file_path_dict.keys() if complete_file in file]\n",
|
||||
"\n",
|
||||
" outfile_path = os.path.join(workdir_path,\"table_\"+complete_file+\".csv\")\n",
|
||||
" # print(outfile_path,file_list)\n",
|
||||
"\n",
|
||||
" if len(file_list)==1:\n",
|
||||
" file_path = file_path_dict.get(file_list[0])\n",
|
||||
" print(\"MOVE:\",file_path ,\"TO\",outfile_path)\n",
|
||||
" shutil.move(file_path, outfile_path)\n",
|
||||
" else:\n",
|
||||
" print(\"CONCAT:\",file_list ,\"TO\",outfile_path)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" CHUNK_SIZE = 50000\n",
|
||||
" with_header=True\n",
|
||||
" first_one = True\n",
|
||||
" for csv_file_name in file_list:\n",
|
||||
" csv_file_path = file_path_dict.get(csv_file_name)\n",
|
||||
" chunk_container = pd.read_csv(csv_file_path, chunksize=CHUNK_SIZE,low_memory=False)\n",
|
||||
" for chunk in tqdm(chunk_container):\n",
|
||||
" chunk.to_csv(outfile_path, mode=\"a\", index=False, header=with_header)\n",
|
||||
" with_header=False\n",
|
||||
" first_one = False\n",
|
||||
"\n",
|
||||
" print(\"deleting\")\n",
|
||||
" for csv_file_name in file_list:\n",
|
||||
" csv_file_path = file_path_dict.get(csv_file_name)\n",
|
||||
" print(csv_file_path)\n",
|
||||
" os.remove(csv_file_path)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
@ -1,375 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import pyarrow\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import os\n",
|
||||
"import shutil"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "'2.0.0rc1'"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pd.__version__"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n",
|
||||
"D:\\PATSTAT\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"print(os.getcwd()) # Prints the current working directory\n",
|
||||
"\n",
|
||||
"workdir_path=r\"D:\\PATSTAT\"\n",
|
||||
"os.chdir(workdir_path)\n",
|
||||
"print(os.getcwd())"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CONCAT: ['tls201_part01.csv', 'tls201_part02.csv', 'tls201_part03.csv'] TO D:\\PATSTAT\\table_tls201.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1000it [08:05, 2.06it/s]\n",
|
||||
"1000it [08:23, 1.99it/s]\n",
|
||||
"506it [04:04, 2.07it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls201_part01\\tls201_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls201_part02\\tls201_part02.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls201_part03\\tls201_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_01\\tls202_part01\\tls202_part01.csv TO D:\\PATSTAT\\table_tls202.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_05\\tls204_part01\\tls204_part01.csv TO D:\\PATSTAT\\table_tls204.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_05\\tls205_part01\\tls205_part01.csv TO D:\\PATSTAT\\table_tls205.csv\n",
|
||||
"CONCAT: ['tls206_part01.csv', 'tls206_part02.csv'] TO D:\\PATSTAT\\table_tls206.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1000it [08:15, 2.02it/s]\n",
|
||||
"777it [06:38, 1.95it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_05\\tls206_part01\\tls206_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls206_part02\\tls206_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls207_part01\\tls207_part01.csv TO D:\\PATSTAT\\table_tls207.csv\n",
|
||||
"CONCAT: ['tls209_part01.csv', 'tls209_part02.csv', 'tls209_part03.csv'] TO D:\\PATSTAT\\table_tls209.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3200it [07:58, 6.69it/s]\n",
|
||||
"3200it [07:54, 6.75it/s]\n",
|
||||
"217it [00:31, 6.96it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls209_part01\\tls209_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls209_part02\\tls209_part02.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls209_part03\\tls209_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_06\\tls210_part01\\tls210_part01.csv TO D:\\PATSTAT\\table_tls210.csv\n",
|
||||
"CONCAT: ['tls211_part01.csv', 'tls211_part02.csv'] TO D:\\PATSTAT\\table_tls211.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2500it [08:18, 5.01it/s]\n",
|
||||
"463it [01:43, 4.46it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls211_part01\\tls211_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls211_part02\\tls211_part02.csv\n",
|
||||
"CONCAT: ['tls212_part01.csv', 'tls212_part02.csv', 'tls212_part03.csv'] TO D:\\PATSTAT\\table_tls212.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"4000it [10:46, 6.19it/s]\n",
|
||||
"4000it [10:54, 6.11it/s]\n",
|
||||
"1764it [04:47, 6.13it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls212_part01\\tls212_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_07\\tls212_part02\\tls212_part02.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls212_part03\\tls212_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls214_part01\\tls214_part01.csv TO D:\\PATSTAT\\table_tls214.csv\n",
|
||||
"CONCAT: ['tls215_part01.csv', 'tls215_part02.csv', 'tls215_part03.csv'] TO D:\\PATSTAT\\table_tls215.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8000it [12:35, 10.60it/s]\n",
|
||||
"8000it [12:24, 10.74it/s]\n",
|
||||
"3101it [04:50, 10.66it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls215_part01\\tls215_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_08\\tls215_part02\\tls215_part02.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls215_part03\\tls215_part03.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls216_part01\\tls216_part01.csv TO D:\\PATSTAT\\table_tls216.csv\n",
|
||||
"CONCAT: ['tls222_part01.csv', 'tls222_part02.csv'] TO D:\\PATSTAT\\table_tls222.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5000it [08:34, 9.71it/s]\n",
|
||||
"2489it [04:16, 9.70it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls222_part01\\tls222_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls222_part02\\tls222_part02.csv\n",
|
||||
"CONCAT: ['tls224_part01.csv', 'tls224_part02.csv', 'tls224_part03.csv'] TO D:\\PATSTAT\\table_tls224.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3000it [04:24, 11.35it/s]\n",
|
||||
"3000it [04:27, 11.20it/s]\n",
|
||||
"1442it [02:07, 11.29it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls224_part01\\tls224_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls224_part02\\tls224_part02.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls224_part03\\tls224_part03.csv\n",
|
||||
"CONCAT: ['tls225_part01.csv', 'tls225_part02.csv'] TO D:\\PATSTAT\\table_tls225.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3000it [08:29, 5.89it/s]\n",
|
||||
"718it [02:02, 5.88it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls225_part01\\tls225_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls225_part02\\tls225_part02.csv\n",
|
||||
"CONCAT: ['tls226_part01.csv', 'tls226_part02.csv'] TO D:\\PATSTAT\\table_tls226.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1500it [10:40, 2.34it/s]\n",
|
||||
"664it [04:56, 2.24it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_09\\tls226_part01\\tls226_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls226_part02\\tls226_part02.csv\n",
|
||||
"CONCAT: ['tls227_part01.csv', 'tls227_part02.csv'] TO D:\\PATSTAT\\table_tls227.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8000it [12:42, 10.49it/s]\n",
|
||||
"862it [01:22, 10.39it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"deleting\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls227_part01\\tls227_part01.csv\n",
|
||||
"D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls227_part02\\tls227_part02.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls228_part01\\tls228_part01.csv TO D:\\PATSTAT\\table_tls228.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls229_part01\\tls229_part01.csv TO D:\\PATSTAT\\table_tls229.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_10\\tls230_part01\\tls230_part01.csv TO D:\\PATSTAT\\table_tls230.csv\n",
|
||||
"MOVE: D:\\PATSTAT\\PATSTAT_spring\\data_PATSTAT_Global_2023_Spring_11\\tls801_part01\\tls801_part01.csv TO D:\\PATSTAT\\table_tls801.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file_path_dict=dict()\n",
|
||||
"\n",
|
||||
"# iterate over files in\n",
|
||||
"# that directory\n",
|
||||
"for root, dirs, files in os.walk(workdir_path):\n",
|
||||
" for filename in files:\n",
|
||||
" if filename.endswith(\".csv\") and not filename.startswith(\"table\"):\n",
|
||||
" path= os.path.join(root, filename)\n",
|
||||
" file_path_dict[filename] = path\n",
|
||||
"\n",
|
||||
"complete_file_set = set()\n",
|
||||
"for fname in file_path_dict.keys():\n",
|
||||
" complete_file_set.add(fname.split(\"_\")[0])\n",
|
||||
"complete_file_set = sorted(complete_file_set)\n",
|
||||
"\n",
|
||||
"for complete_file in complete_file_set:\n",
|
||||
" file_list = [file for file in file_path_dict.keys() if complete_file in file]\n",
|
||||
"\n",
|
||||
" outfile_path = os.path.join(workdir_path,\"table_\"+complete_file+\".csv\")\n",
|
||||
" # print(outfile_path,file_list)\n",
|
||||
"\n",
|
||||
" if len(file_list)==1:\n",
|
||||
" file_path = file_path_dict.get(file_list[0])\n",
|
||||
" print(\"MOVE:\",file_path ,\"TO\",outfile_path)\n",
|
||||
" shutil.move(file_path, outfile_path)\n",
|
||||
" else:\n",
|
||||
" print(\"CONCAT:\",file_list ,\"TO\",outfile_path)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" CHUNK_SIZE = 50000\n",
|
||||
" with_header = True\n",
|
||||
" first_one = True\n",
|
||||
" for csv_file_name in file_list:\n",
|
||||
" csv_file_path = file_path_dict.get(csv_file_name)\n",
|
||||
" chunk_container = pd.read_csv(csv_file_path, chunksize=CHUNK_SIZE,low_memory=False)\n",
|
||||
" for chunk in tqdm(chunk_container):\n",
|
||||
" chunk.to_csv(outfile_path, mode=\"a\", index=False, header=with_header)\n",
|
||||
" with_header=False\n",
|
||||
" first_one = False\n",
|
||||
"\n",
|
||||
" print(\"deleting\")\n",
|
||||
" for csv_file_name in file_list:\n",
|
||||
" csv_file_path = file_path_dict.get(csv_file_name)\n",
|
||||
" print(csv_file_path)\n",
|
||||
" os.remove(csv_file_path)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
Before Width: | Height: | Size: 988 KiB |
Before Width: | Height: | Size: 718 KiB |
Before Width: | Height: | Size: 742 KiB |
Before Width: | Height: | Size: 653 KiB |
Before Width: | Height: | Size: 755 KiB |
Before Width: | Height: | Size: 1.3 MiB |
Before Width: | Height: | Size: 536 KiB |
Before Width: | Height: | Size: 756 KiB |
Before Width: | Height: | Size: 742 KiB |
Before Width: | Height: | Size: 764 KiB |
Before Width: | Height: | Size: 751 KiB |
Before Width: | Height: | Size: 900 KiB |
Before Width: | Height: | Size: 1010 KiB |
Before Width: | Height: | Size: 1.2 MiB |
Before Width: | Height: | Size: 1.0 MiB |
Before Width: | Height: | Size: 800 KiB |
Before Width: | Height: | Size: 808 KiB |
Before Width: | Height: | Size: 762 KiB |
Before Width: | Height: | Size: 785 KiB |
Before Width: | Height: | Size: 968 KiB |
Before Width: | Height: | Size: 747 KiB |
Before Width: | Height: | Size: 918 KiB |
Before Width: | Height: | Size: 1.1 MiB |
Before Width: | Height: | Size: 683 KiB |
Before Width: | Height: | Size: 832 KiB |
Before Width: | Height: | Size: 858 KiB |
Before Width: | Height: | Size: 598 KiB |
Before Width: | Height: | Size: 629 KiB |
Before Width: | Height: | Size: 703 KiB |
Before Width: | Height: | Size: 716 KiB |
Before Width: | Height: | Size: 841 KiB |
Before Width: | Height: | Size: 754 KiB |
Before Width: | Height: | Size: 726 KiB |
Before Width: | Height: | Size: 782 KiB |
Before Width: | Height: | Size: 779 KiB |
Before Width: | Height: | Size: 681 KiB |
Before Width: | Height: | Size: 785 KiB |
Before Width: | Height: | Size: 673 KiB |
Before Width: | Height: | Size: 746 KiB |
Before Width: | Height: | Size: 705 KiB |
Before Width: | Height: | Size: 807 KiB |
Before Width: | Height: | Size: 754 KiB |
Before Width: | Height: | Size: 736 KiB |
Before Width: | Height: | Size: 760 KiB |
Before Width: | Height: | Size: 738 KiB |
Before Width: | Height: | Size: 752 KiB |
Before Width: | Height: | Size: 729 KiB |
Before Width: | Height: | Size: 761 KiB |
Before Width: | Height: | Size: 746 KiB |
Before Width: | Height: | Size: 693 KiB |
Before Width: | Height: | Size: 723 KiB |
Before Width: | Height: | Size: 700 KiB |
Before Width: | Height: | Size: 623 KiB |
Before Width: | Height: | Size: 598 KiB |
Before Width: | Height: | Size: 626 KiB |