You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
319 lines
17 KiB
319 lines
17 KiB
{ |
|
"cells": [ |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"test_file = 'OUROCARD_VISA_INFINITE-Ago_24.txt'" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"from pypdf import PdfReader\n", |
|
"\n", |
|
"reader = PdfReader('cc.pdf')\n", |
|
"number_of_pages = len(reader.pages)\n", |
|
"output = ''\n", |
|
"for i in range(number_of_pages):\n", |
|
" page = reader.pages[i]\n", |
|
" output += page.extract_text()\n", |
|
"print(output)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as reader:\n", |
|
" data = reader.read()\n", |
|
" print(data)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"import re\n", |
|
"\n", |
|
"# Open the text file\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", |
|
" # Read the contents of the file\n", |
|
" contents = file.read()\n", |
|
"\n", |
|
"# Define the regex pattern to match\n", |
|
"pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", |
|
"\n", |
|
"# Iterate over the lines that match the pattern\n", |
|
"for match in re.finditer(pattern, contents):\n", |
|
" print(match.group())" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"import re\n", |
|
"\n", |
|
"# Open the text file\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", |
|
" # Read the contents of the file\n", |
|
" contents = file.read()\n", |
|
"\n", |
|
"# Define the regex pattern to match\n", |
|
"pattern = r'.*DANIEL.*|.*IZABELY.*'\n", |
|
"\n", |
|
"# Iterate over the lines that match the pattern\n", |
|
"for match in re.finditer(pattern, contents):\n", |
|
" print(match.group())" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"import re\n", |
|
"\n", |
|
"# Open the text file\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", |
|
" # Read the contents of the file\n", |
|
" contents = file.read()\n", |
|
"\n", |
|
"# Define the regex patterns\n", |
|
"dan_pattern = r'*DANIEL.*'\n", |
|
"iza_pattern = r'.*IZABELY.*'\n", |
|
"line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", |
|
"\n", |
|
"# Iterate over the lines that match the pattern\n", |
|
"for match in re.finditer(line_pattern, contents):\n", |
|
" print(match.group())\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"# Open the text file\n", |
|
"with open('table-test.txt', 'r') as file:\n", |
|
" # Read the contents of the file\n", |
|
" contents = file.readlines()\n", |
|
"\n", |
|
"# Initialize lists to store the lines under each table\n", |
|
"table_a_lines = []\n", |
|
"table_b_lines = []\n", |
|
"\n", |
|
"# Flag to determine which table section we are in\n", |
|
"current_table = None\n", |
|
"\n", |
|
"# Iterate over the lines in the file\n", |
|
"for line in contents:\n", |
|
" line = line.strip() # Remove leading and trailing whitespace\n", |
|
"\n", |
|
" # Check for TABLEA and TABLEB\n", |
|
" if line == 'TABLEA':\n", |
|
" current_table = 'TABLEA'\n", |
|
" elif line == 'TABLEB':\n", |
|
" current_table = 'TABLEB'\n", |
|
" else:\n", |
|
" # Add lines to the appropriate list based on the current table\n", |
|
" if current_table == 'TABLEA':\n", |
|
" table_a_lines.append(line)\n", |
|
" elif current_table == 'TABLEB':\n", |
|
" table_b_lines.append(line)\n", |
|
"\n", |
|
"# Print the results\n", |
|
"print('Lines under TABLEA:')\n", |
|
"for data in table_a_lines:\n", |
|
" print(data)\n", |
|
"\n", |
|
"print('\\nLines under TABLEB:')\n", |
|
"for data in table_b_lines:\n", |
|
" print(data)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 74, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"found Dan\n", |
|
"found Iza\n", |
|
"Lines under list_dan:\n", |
|
"[('2024-07-15', 'NEW EMPAR EMPREENDIMENT', 'CORUMBA DE GO ', 'BR', Decimal('1000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-14', 'NAZO SUSHI BAR ', 'BRASILIA ', 'BR', Decimal('44622'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-18', 'IFD*IFOOD.COM AGENCIA D', 'Osasco ', 'BR', Decimal('1290'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-19', 'IFD*RC MELO COMERCIO D ', 'BRASILIA ', 'BR', Decimal('13830'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-21', 'GRUPO FARTURA DE HORTI ', 'BRASILIA ', 'BR', Decimal('10579'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-20', 'IFD*JL COMERCIO VAREJIS', 'BRASILIA ', 'BR', Decimal('13470'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-22', 'MURAKAMI ', 'BRASILIA ', 'BR', Decimal('5590'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-22', 'ZP *CANTINAGOODLANCHEV ', 'Brasilia ', 'BR', Decimal('840'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-23', 'CANTINA E CIA ', 'BRASILIA ', 'BR', Decimal('1000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-25', 'CANTINA E CIA ', 'BRASILIA ', 'BR', Decimal('350'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-24', 'IFD*NFE COMERCIO DE ALI', 'BRASILIA ', 'BR', Decimal('10189'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-27', 'BENITA PANINOTECA ', 'BRASILIA ', 'BR', Decimal('15680'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-26', 'IFD*BSQUARE PIZZA BURGE', 'BRASILIA ', 'BR', Decimal('12399'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-04', 'IFD*RC MELO COMERCIO DE', 'BRASILIA ', 'BR', Decimal('10329'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-06', 'CANTINA E CIA ', 'BRASILIA ', 'BR', Decimal('600'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-06', 'ZP *CANTINAGOODLANCHEV ', 'Brasilia ', 'BR', Decimal('21000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-07', 'IFD*BSQUARE PIZZA BURGE', 'BRASILIA ', 'BR', Decimal('23299'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-22', 'DROGASIL 2067 ', 'BRASILIA ', 'BR', Decimal('20499'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-22', 'REDE BRASIL DRUGSTORE ', 'BRASILIA ', 'BR', Decimal('2674'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-10', 'PAGUE MENOS 1225 ', 'BRASILIA ', 'BR', Decimal('40630'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-13', 'PAG*EduardoMeireles ', 'AGUAS LINDAS ', 'BR', Decimal('1400'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-13', 'MR JOHN BARBEARIA LTDA ', 'BRASILIA ', 'BR', Decimal('6000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-18', 'NETFLIX.COM ', 'SAO PAULO ', 'BR', Decimal('4490'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-19', 'PAG*FolhaDeSPaulo ', 'SAO PAULO ', 'BR', Decimal('2990'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-19', 'IFD*CR EXPRESS ', 'Osasco ', 'BR', Decimal('1000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-22', 'APPLE.COM/BILL ', 'SAO PAULO ', 'BR', Decimal('9790'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-22', 'MERCADOLIVRE*MERCADOLIV', 'OSASCO ', 'BR', Decimal('44440'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-25', 'MP*5PRODUTOS ', 'OSASCO ', 'BR', Decimal('30893'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-25', 'MERCADOLIVRE*SABORESDAM', 'OSASCO ', 'BR', Decimal('25800'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-27', 'MP*MELIMAIS ', 'OSASCO ', 'BR', Decimal('1799'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-27', 'Wellhub Gympass BR Gymp', 'Sao Paulo ', 'BR', Decimal('39990'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-26', 'IFD*CR EXPRESS ', 'Osasco ', 'BR', Decimal('1000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-29', 'MG LAVA JATO ', 'BRASILIA ', 'BR', Decimal('10000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-07-29', 'MERCADOLIVRE*GLDECOR ', 'OSASCO ', 'BR', Decimal('5075'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-01', 'MR JOHN BARBEARIA LTDA ', 'BRASILIA ', 'BR', Decimal('5400'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-04', 'APPLE.COM/BILL ', 'SAO PAULO ', 'BR', Decimal('5490'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-04', 'IFD*SCORPIONS EXPRESS ', 'Osasco ', 'BR', Decimal('1000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-08', 'CASCOL COMBUSTIVEIS ', 'BRASILIA ', 'BR', Decimal('23937'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-07', 'IFD*GRAN LOG EXPRESS ', 'Osasco ', 'BR', Decimal('1000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-09', 'SUPERAUTOR C*Supe ', 'NITEROI ', 'BR', Decimal('24786'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-09', 'UBER* TRIP ', 'WWW.UBER.COM. ', 'BR', Decimal('3089'), Decimal('0'), 1, '', '2024-08-27 15:06:40.471820'), ('2024-08-09', 'UBER *TRIP HELP.UBER.CO', 'SAO PAULO ', 'BR', Decimal('1000'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-08-09', 'UBER * PENDING ', 'SAO PAULO ', 'BR', Decimal('2793'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-08-09', 'UBER *TRIP HELP.UBER.CO', 'SAO PAULO ', 'BR', Decimal('500'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-15', 'VELOE ', 'BARUERI ', 'BR', Decimal('2226'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-19', 'POUSADA PIRENEUS RESOR ', 'PIRENOPOLIS ', 'BR', Decimal('17921'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-13', 'DL*GOOGLE YouTub ', 'SAO PAULO ', 'BR', Decimal('4190'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-16', 'STEAMGAMES.COM 42595229', '912-1844160 ', 'WA', Decimal('2400'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-16', 'IOF - COMPRA NO EXTERIO', 'R ', ' ', Decimal('26'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-16', 'STEAM PURCHASE ', 'SEATTLE ', 'DE', Decimal('12719'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-18', 'IOF - COMPRA NO EXTERIO', 'R ', ' ', Decimal('139'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-07-22', 'PAG*XsollaGames ', 'Sao Paulo ', 'BR', Decimal('2699'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-04-11', 'PRODUTOS GLOB PARC 04/1', '2 RIO DE JANEI', 'BR', Decimal('4490'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-01-15', 'MP*MUNDODOSCO PARC 07/1', '0 SAO PAULO ', 'BR', Decimal('15990'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-05-17', 'PAG*Folhadesp PARC 03/0', '6 Sao Paulo ', 'BR', Decimal('10960'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2023-10-17', 'BIANCHINI AUT PARC 10/1', '0 BRASILIA ', 'BR', Decimal('53500'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828'), ('2024-05-27', 'PARC=112 BRAS PARC 03/1', '2 BRASILIA ', 'BR', Decimal('45200'), Decimal('0'), 1, '', '2024-08-27 15:06:40.472828')]\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"import re\n", |
|
"from datetime import date, datetime\n", |
|
"from decimal import *\n", |
|
"import locale\n", |
|
"\n", |
|
"locale.setlocale(locale.LC_ALL, 'BRA')\n", |
|
"getcontext().prec = 2\n", |
|
"\n", |
|
"# Open the text file\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", |
|
" # Read the contents of the file\n", |
|
" contents = file.readlines()\n", |
|
"\n", |
|
"# Define the regex patterns\n", |
|
"dan_pattern = r'1 - DANIEL.*'\n", |
|
"iza_pattern = r'4 - IZABELY.*'\n", |
|
"line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", |
|
"line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n", |
|
"\n", |
|
"# Lists\n", |
|
"list_dan = []\n", |
|
"list_iza = []\n", |
|
"current_list = None\n", |
|
"\n", |
|
"insert_bulk = []\n", |
|
"\n", |
|
"# Iterate all lines\n", |
|
"for line in contents:\n", |
|
" line = line.strip()\n", |
|
" # print(line)\n", |
|
" if re.match(dan_pattern, line):\n", |
|
" current_list = 'list_dan'\n", |
|
" print('found Dan')\n", |
|
" elif re.match(iza_pattern, line):\n", |
|
" current_list = 'list_iza'\n", |
|
" print('found Iza')\n", |
|
" else:\n", |
|
" # print(line)\n", |
|
" if re.match(line_pattern, line):\n", |
|
" if current_list == 'list_dan':\n", |
|
" list_dan.append(line)\n", |
|
" elif current_list == 'list_iza':\n", |
|
" list_iza.append(line)\n", |
|
"\n", |
|
"# Print the results\n", |
|
"print('Lines under list_dan:')\n", |
|
"for item in list_dan:\n", |
|
" # print(item)\n", |
|
" match = re.search(line_group_pattern, item)\n", |
|
" # print(match.group(1))\n", |
|
" # build tuples for inserting\n", |
|
" tDate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n", |
|
" tDesc = match.group(4)\n", |
|
" tCity = match.group(5)\n", |
|
" tCountry = match.group(6)\n", |
|
" tOutflow = Decimal(locale.atoi(match.group(7).strip()))\n", |
|
" tInflow = Decimal(locale.atoi(match.group(8).strip()))\n", |
|
" tOwner = 1\n", |
|
" tInstallments = \"\"\n", |
|
" tCreation = str(datetime.now(tz=None))\n", |
|
" insert_bulk.append((tDate, tDesc, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreation))\n", |
|
"print(insert_bulk)\n", |
|
"\n", |
|
"\n", |
|
"# print('\\nLines under list_iza:')\n", |
|
"# for item in list_iza:\n", |
|
"# # print(item)\n", |
|
"# match = re.search(line_group_pattern, item)\n", |
|
"# print(match.group(1))\n", |
|
"# tDate = date(int(match.group(3)), int(match.group(2)), int(match.group(1)))\n", |
|
"# tDesc = match.group(4)\n", |
|
"# tCity = match.group(5)\n", |
|
"# tCountry = match.group(6)\n", |
|
"# tOutflow = match.group(7)\n", |
|
"# tInflow = match.group(8)\n", |
|
"# tOwner = \"2\"\n", |
|
"# tInstallments = \"\"\n", |
|
"# tCreation = datetime.now(tz=None)\n", |
|
"# insert_bulk.append((tDate, tDesc, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreation))\n", |
|
"# print(insert_bulk)\n", |
|
"\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 77, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"insert_query =\"\"\"INSERT INTO transaction (Date, Desc, City, Country, Outflow, Inflow, OwnerId, Installments, created_at) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\"\"\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 78, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"CONNECTED! <mysql.connector.connection_cext.CMySQLConnection object at 0x0000022FA34A3560>\n", |
|
"1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'Desc, City, Country, Outflow, Inflow, OwnerId, Installments, created_at) VALUES ' at line 1\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"# from getpass import getpass\n", |
|
"from mysql.connector import connect, Error\n", |
|
"\n", |
|
"try:\n", |
|
" with connect(\n", |
|
" host='localhost',\n", |
|
" user='root',\n", |
|
" password='pleasehashapasswordomg',\n", |
|
" database='default'\n", |
|
" ) as connection:\n", |
|
" print(\"CONNECTED!\", connection)\n", |
|
" with connection.cursor() as cursor:\n", |
|
" cursor.executemany(insert_query, insert_bulk)\n", |
|
"except Error as e:\n", |
|
" print(e)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [] |
|
} |
|
], |
|
"metadata": { |
|
"kernelspec": { |
|
"display_name": "Python 3", |
|
"language": "python", |
|
"name": "python3" |
|
}, |
|
"language_info": { |
|
"codemirror_mode": { |
|
"name": "ipython", |
|
"version": 3 |
|
}, |
|
"file_extension": ".py", |
|
"mimetype": "text/x-python", |
|
"name": "python", |
|
"nbconvert_exporter": "python", |
|
"pygments_lexer": "ipython3", |
|
"version": "3.12.5" |
|
} |
|
}, |
|
"nbformat": 4, |
|
"nbformat_minor": 2 |
|
}
|
|
|