4 changed files with 2191 additions and 521 deletions
File diff suppressed because one or more lines are too long
@ -1,499 +0,0 @@
|
||||
{ |
||||
"cells": [ |
||||
# File name of the text export that this notebook parses.
test_file = "OUROCARD_VISA_INFINITE-Ago_24.txt"
||||
# Print the whole export for a quick visual check.
# The file is decoded as Latin-1, the same encoding the parsing cells further
# down use for it; relying on the platform default can raise UnicodeDecodeError.
with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as reader:
    data = reader.read()
    print(data)
||||
import re

# Read the whole export into one string.
# Latin-1 matches the encoding the later parsing cells use for this file.
with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:
    contents = file.read()

# A dd.mm.yyyy date, then 23 + 14 + 2 arbitrary characters, then two amounts
# written with a comma as the decimal separator.
pattern = r'\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*\d+,\d{2}\s*\d+,\d{2}'

# Print every span of the file that matches the pattern.
for matches in re.finditer(pattern, contents):
    print(matches.group())
||||
import re

# Read the whole export into one string.
# Latin-1 matches the encoding the later parsing cells use for this file.
with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:
    contents = file.read()

# Any line that mentions one of the two names ('.' does not cross newlines,
# so each match is a single line).
pattern = r'.*DANIEL.*|.*IZABELY.*'

# Print every matching line.
for matches in re.finditer(pattern, contents):
    print(matches.group())
||||
import re

# Read the whole export into one string.
# Latin-1 matches the encoding the later parsing cells use for this file.
with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:
    contents = file.read()

# Owner patterns (defined here but not used by the loop below).
# The original dan_pattern started with a bare '*', which re rejects with
# "nothing to repeat" as soon as the pattern is compiled.
dan_pattern = r'.*DANIEL.*'
iza_pattern = r'.*IZABELY.*'

# A dd.mm.yyyy date, then 23 + 14 + 2 arbitrary characters, then two amounts
# written with a comma as the decimal separator.
line_pattern = r'\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*\d+,\d{2}\s*\d+,\d{2}'

# Print every span of the file that matches the transaction pattern.
for matches in re.finditer(line_pattern, contents):
    print(matches.group())
||||
# Split 'table-test.txt' into the lines that follow a TABLEA marker and the
# lines that follow a TABLEB marker.
with open('table-test.txt', 'r') as file:
    contents = file.readlines()

table_a_lines = []
table_b_lines = []

# Marker of the section currently being read; None until the first marker.
current_table = None

for raw_line in contents:
    line = raw_line.strip()

    # A marker line only switches the section; it is never stored.
    if line in ('TABLEA', 'TABLEB'):
        current_table = line
        continue

    # Lines seen before the first marker belong to no section and are dropped.
    if current_table == 'TABLEA':
        table_a_lines.append(line)
    elif current_table == 'TABLEB':
        table_b_lines.append(line)

print('Lines under TABLEA:')
for data in table_a_lines:
    print(data)

print('\nLines under TABLEB:')
for data in table_b_lines:
    print(data)
||||
import re
from datetime import date, datetime
import locale

# Kept from the original cell: switches the process locale to pt_BR.
# NOTE(review): nothing below appears to depend on the locale - confirm before removing.
locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')

# Read the export line by line (Latin-1 encoded).
with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:
    contents = file.readlines()

# Header lines that start each owner's section.
dan_pattern = r'1 - DANIEL.*'
iza_pattern = r'4 - IZABELY.*'

# Transaction line: dd.mm.yyyy, 23 + 14 + 2 characters, two comma-decimal amounts.
line_pattern = r'\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*\d+,\d{2}\s*\d+,\d{2}'
# Same shape with capture groups: day, month, year, memo, city, country, outflow, inflow.
line_group_pattern = r'(\d{2})\.(\d{2})\.(\d{4})(.{23})(.{14})(.{2})(\s*\d+,\d{2})(\s*\d+,\d{2})'

list_dan = []
list_iza = []
current_list = None

insert_bulk = []

# Route every transaction line to the list of the owner whose header was seen last.
for line in contents:
    line = line.strip()
    if re.match(dan_pattern, line):
        current_list = 'list_dan'
        print('found Dan')
    elif re.match(iza_pattern, line):
        current_list = 'list_iza'
        print('found Iza')
    elif re.match(line_pattern, line):
        if current_list == 'list_dan':
            print("dan", line)
            list_dan.append(line)
        elif current_list == 'list_iza':
            print("iza", line)
            list_iza.append(line)


def _to_insert_tuple(item, owner_id):
    """Turn one collected transaction line into the 11-field tuple stored in insert_bulk.

    Fields, in order: ISO date, account (always 1), memo, city, country,
    outflow, inflow, owner id, installments (always 1), creation timestamp,
    update timestamp (always None). Amounts stay strings, with the decimal
    comma replaced by a dot.
    """
    matches = re.search(line_group_pattern, item)
    day, month, year = (int(matches.group(n)) for n in (1, 2, 3))
    return (
        str(date(year, month, day)),
        1,
        matches.group(4),
        matches.group(5),
        matches.group(6),
        matches.group(7).strip().replace(',', '.'),
        matches.group(8).strip().replace(',', '.'),
        owner_id,
        1,
        str(datetime.now(tz=None)),
        None,
    )


print('list_dan - tuples for insert')
insert_bulk.extend(_to_insert_tuple(item, 1) for item in list_dan)

# The original printed the 'list_dan' heading a second time here (copy-paste slip).
print('list_iza - tuples for insert')
insert_bulk.extend(_to_insert_tuple(item, 2) for item in list_iza)
||||
# Parameterised INSERT used with executemany(): twelve columns, twelve %s placeholders.
insert_query = (
    "INSERT INTO default.TRANSACTION "
    "(TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, "
    "INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) "
    "VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"
)
||||
def create_lists(path: str = "OUROCARD_VISA_INFINITE-Ago_24.txt") -> dict:
    """Group the transaction lines of a text export by owner.

    The file is read as Latin-1 and every line is stripped. A line that starts
    with ``<digit> - <name>`` is an owner header; each following line that looks
    like a payment or a transaction is stored under that owner.

    Args:
        path: File to parse. Defaults to the export the notebook was written for,
            so existing ``create_lists()`` calls keep working.

    Returns:
        A dict keyed by ``"list_<name in lower case>"``. Each value is a dict with
        ``"owner_name"`` (name as written in the header), ``"owner_id"``
        (1, 2, ... in order of first appearance) and ``"tlist"`` (the collected
        lines, in file order).

    Differences from the previous implementation (all bug fixes):
        * Owners are registered only from header lines. Scanning for the owner
          pattern anywhere in a line also matched unrelated text, such as a
          negative amount following a number, and created owners with no
          ``"tlist"`` key.
        * A header that appears again keeps the owner's id and the lines already
          collected, instead of taking a new id and emptying the list.
        * Lines found before the first header are skipped instead of raising
          ``KeyError``.
    """
    import re

    # Owner header: one digit, a dash, then the owner's name (captured).
    owner_pattern = r"\d\s*-\s*(\w+)"
    # Transaction: dd.mm.yyyy, 23 + 14 + 2 characters, then two amounts.
    line_pattern = r"\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*-?\d*\.?\d+,\d{2}\s*\d+,\d{2}"
    # Payment: dd.mm.yyyy immediately followed by PGTO ... 200211 and two amounts.
    payment_pattern = r"\d{2}\.\d{2}\.\d{4}PGTO.*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})"

    result = {}
    current_list = None

    with open(path, "r", encoding="latin") as file:
        for raw_line in file:
            line = raw_line.strip()

            header = re.match(owner_pattern, line)
            if header:
                owner_name = header.group(1)
                current_list = f"list_{owner_name.lower()}"
                if current_list not in result:
                    result[current_list] = {
                        "owner_name": owner_name,
                        "owner_id": len(result) + 1,
                        "tlist": [],
                    }
                continue

            # Nothing can be attributed before the first owner header.
            if current_list is None:
                continue

            if re.match(payment_pattern, line) or re.match(line_pattern, line):
                result[current_list]["tlist"].append(line)

    return result
||||
# Show the owners found in the export together with their collected lines.
parsed_owners = create_lists()
print(parsed_owners)
||||
def build_insert(input_dict: dict, account: int):
    """Convert collected statement lines into row tuples for the INSERT statement.

    Args:
        input_dict: Mapping of list name to a dict holding ``"owner_id"`` and
            ``"tlist"`` (raw lines). An entry without ``"tlist"`` contributes
            no rows.
        account: Value stored in the account column of every row.

    Returns:
        A list of 12-tuples: (date as ISO string, account, memo, city, country,
        outflow, inflow, owner id, installment number, installment total,
        creation timestamp string, None). Amounts are strings with thousands
        dots removed and the decimal comma replaced by a dot.

        * Payment lines: memo is ``"PGTO DEBITO CONTA"``, only inflow is filled;
          city, country, outflow and both installment fields are None.
        * Lines containing ``PARC nn/mm``: installment number and total are ints.
        * All other lines: installment number is 1 and the total is None.

    Raises:
        ValueError: If a line matches neither pattern (this used to surface as
            an ``AttributeError`` on ``None``), or if its date is invalid.
    """
    from datetime import date, datetime
    import re

    # Groups: 1-3 day/month/year; 5-8 installment variant (memo, nr, total, city);
    # 9-10 fixed-width variant (memo, city); 11 country; 12 outflow; 13 inflow.
    line_group_pattern = r"(\d{2})\.(\d{2})\.(\d{4})((.+PARC (\d+.)\/(\d+))(\s.{12})|(.{23})(.{14}))(.{2})(\s*-?\d*\.?\d+,\d{2})(\s*\d*\.?\d+,\d{2})"
    # Groups: 1-3 day/month/year; 4 memo; 5 first amount; 6 second amount.
    payment_pattern = r"(\d{2})\.(\d{2})\.(\d{4})(PGTO DEBITO CONTA).*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})"

    def _iso_date(found):
        # Groups 1-3 are day, month, year in both patterns.
        return str(date(int(found.group(3)), int(found.group(2)), int(found.group(1))))

    def _amount(text):
        # "1.234,56" -> "1234.56"
        return text.strip().replace(".", "").replace(",", ".")

    insert_bulk = []

    for key in input_dict:
        for item in input_dict[key].get("tlist", ()):
            owner_id = input_dict[key]["owner_id"]

            payment = re.match(payment_pattern, item)
            if payment:
                tdate = _iso_date(payment)
                memo = payment.group(4)
                city = country = outflow = None
                inflow = _amount(payment.group(5))
                installment_nr = installment_tt = None
            else:
                purchase = re.match(line_group_pattern, item)
                if purchase is None:
                    raise ValueError(f"Unrecognised transaction line: {item!r}")
                tdate = _iso_date(purchase)

                if purchase.group(5):
                    # Installment purchase: memo includes the "PARC nn/mm" marker.
                    memo = purchase.group(5)
                    city = purchase.group(8)
                    installment_nr = int(purchase.group(6))
                    installment_tt = int(purchase.group(7))
                else:
                    memo = purchase.group(9)
                    city = purchase.group(10)
                    installment_nr = 1
                    installment_tt = None

                country = purchase.group(11)
                outflow = _amount(purchase.group(12))
                inflow = _amount(purchase.group(13))

            insert_bulk.append(
                (
                    tdate,
                    account,
                    memo,
                    city,
                    country,
                    outflow,
                    inflow,
                    owner_id,
                    installment_nr,
                    installment_tt,
                    str(datetime.now(tz=None)),
                    None,
                )
            )

    return insert_bulk
||||
def db_insert(insert_bulk: list[tuple]):
    """Insert the given rows into the database with one ``executemany`` call.

    Uses the module-level ``insert_query`` as the statement. Connector errors
    are printed, not raised, so the function returns None either way.

    Args:
        insert_bulk: Row tuples, one per record, matching the placeholders of
            ``insert_query``.
    """
    from mysql.connector import connect, Error

    # The former ``finally: connection.close()`` was removed. When connect()
    # itself failed, ``connection`` was never bound and the finally clause raised
    # instead of letting the printed connector error stand. The ``with`` block
    # already closes the connection on exit.
    try:
        # NOTE(review): credentials are hard-coded (root account, literal
        # password). Move them to environment variables or a secrets store.
        with connect(
            host="localhost",
            user="root",
            password="pleasehashapasswordomg",
            database="default",
        ) as connection:
            print("CONNECTED!", connection)
            with connection.cursor() as cursor:
                cursor.executemany(insert_query, insert_bulk)
                connection.commit()
                print("DONE!")
    except Error as e:
        print(e)
||||
# Parse the export, build the rows for account 1 and write them to the database.
owner_lists = create_lists()
rows_to_insert = build_insert(owner_lists, 1)
db_insert(rows_to_insert)
||||
# Preview the rows built for account 1; the last expression is shown as the cell result.
rows_preview = build_insert(create_lists(), 1)
rows_preview
||||
# Preview the parsed owner dictionary; the last expression is shown as the cell result.
owners_preview = create_lists()
owners_preview
||||
# Scratch example: a dict of dicts, printed, changed in place, printed again.
dictTest = {
    "owner1": {"owner_label": "foo", "owner_id": 1, "list1": ["thingies, thingies, 42"]},
    "owner2": {"owner_label": "bar", "owner_id": 2, "list1": ["thingies, thingies, 42"]},
}

# First listing: id and label of each owner, then the owner's list items.
for owner, details in dictTest.items():
    print(details["owner_id"], details["owner_label"])
    for item in details["list1"]:
        print(item)

# Nested values can be reassigned through chained keys.
dictTest["owner1"]["owner_label"] = "yadda"

# Second listing shows the updated label.
for owner, details in dictTest.items():
    print(details["owner_id"], details["owner_label"])
    for item in details["list1"]:
        print(item)
||||
# Scratch example: build a nested dict whose keys come from variables.
param1 = "foo"
param2 = "bar"
testy = {param1: {param2: ["what", "when", "why"], "number": 1}}

print(testy)
||||
], |
||||
"metadata": { |
||||
"kernelspec": { |
||||
"display_name": "Python 3", |
||||
"language": "python", |
||||
"name": "python3" |
||||
}, |
||||
"language_info": { |
||||
"codemirror_mode": { |
||||
"name": "ipython", |
||||
"version": 3 |
||||
}, |
||||
"file_extension": ".py", |
||||
"mimetype": "text/x-python", |
||||
"name": "python", |
||||
"nbconvert_exporter": "python", |
||||
"pygments_lexer": "ipython3", |
||||
"version": "3.12.3" |
||||
} |
||||
}, |
||||
"nbformat": 4, |
||||
"nbformat_minor": 2 |
||||
} |
||||
@ -1,11 +0,0 @@
|
||||
import pandas |
||||
import pdfquery |
||||
|
||||
# Read the PDF.
pdf = pdfquery.PDFQuery('cc.pdf')
pdf.load()

# Write the parsed document tree to disk as pretty-printed XML.
# (The bare `pdf` expression that used to end the script was dropped: outside an
# interactive session it evaluates the name and discards it.)
pdf.tree.write('cc.xml', pretty_print=True)
||||
Loading…
Reference in new issue