You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
509 lines
16 KiB
509 lines
16 KiB
{ |
|
"cells": [ |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"# File name of the statement text export; the later cells open this same name as a literal.\n", |
|
"test_file = \"OUROCARD_VISA_INFINITE-Ago_24.txt\"" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"from pypdf import PdfReader\n", |
|
"\n", |
|
"# Concatenate the text extracted from every page of cc.pdf, then print it.\n", |
|
"reader = PdfReader('cc.pdf')\n", |
|
"number_of_pages = len(reader.pages)\n", |
|
"output = ''\n", |
|
"for i, page in enumerate(reader.pages):\n", |
|
"    output += page.extract_text()\n", |
|
"print(output)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"# Read the whole text export and print it so its layout can be inspected.\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', mode='r') as reader:\n", |
|
"    data = reader.read()\n", |
|
"print(data)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"import re\n", |
|
"\n", |
|
"# Load the whole statement export as one string.\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", |
|
"    contents = file.read()\n", |
|
"\n", |
|
"# Row layout: dd.mm.yyyy date, fixed-width fields of 23, 14 and 2 characters,\n", |
|
"# then two amounts written with a decimal comma.\n", |
|
"pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", |
|
"\n", |
|
"# Print every piece of the export that has this layout.\n", |
|
"for match in re.compile(pattern).finditer(contents):\n", |
|
"    print(match[0])" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"import re\n", |
|
"\n", |
|
"# Load the whole statement export as one string.\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", |
|
"    contents = file.read()\n", |
|
"\n", |
|
"# Either alternative spans a full line ('.' does not cross newlines) that\n", |
|
"# contains DANIEL or IZABELY.\n", |
|
"pattern = r'.*DANIEL.*|.*IZABELY.*'\n", |
|
"\n", |
|
"# Print each of those lines.\n", |
|
"for match in re.compile(pattern).finditer(contents):\n", |
|
"    print(match[0])" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"import re\n", |
|
"\n", |
|
"# Load the whole statement export as one string.\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", |
|
"    contents = file.read()\n", |
|
"\n", |
|
"# Lines naming each cardholder. The leading '.' is required: a pattern that\n", |
|
"# starts with '*' has nothing to repeat and fails to compile.\n", |
|
"dan_pattern = r'.*DANIEL.*'\n", |
|
"iza_pattern = r'.*IZABELY.*'\n", |
|
"# Row layout: dd.mm.yyyy date, fixed-width fields of 23, 14 and 2 characters,\n", |
|
"# then two amounts written with a decimal comma.\n", |
|
"line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", |
|
"\n", |
|
"# Print every piece of the export that has the row layout.\n", |
|
"for match in re.finditer(line_pattern, contents):\n", |
|
"    print(match.group())\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"# Read the sample file as a list of lines.\n", |
|
"with open('table-test.txt', 'r') as file:\n", |
|
"    contents = file.readlines()\n", |
|
"\n", |
|
"# Lines collected under each table marker.\n", |
|
"table_a_lines = []\n", |
|
"table_b_lines = []\n", |
|
"\n", |
|
"# Marker of the section being read; None until the first marker is seen.\n", |
|
"current_table = None\n", |
|
"\n", |
|
"for line in contents:\n", |
|
"    line = line.strip()  # Remove leading and trailing whitespace\n", |
|
"\n", |
|
"    # A marker line switches the section; any other line is stored under the\n", |
|
"    # current section, or dropped when no marker has been seen yet.\n", |
|
"    if line == 'TABLEA':\n", |
|
"        current_table = 'TABLEA'\n", |
|
"    elif line == 'TABLEB':\n", |
|
"        current_table = 'TABLEB'\n", |
|
"    elif current_table == 'TABLEA':\n", |
|
"        table_a_lines.append(line)\n", |
|
"    elif current_table == 'TABLEB':\n", |
|
"        table_b_lines.append(line)\n", |
|
"\n", |
|
"# Print the results\n", |
|
"print('Lines under TABLEA:')\n", |
|
"for data in table_a_lines:\n", |
|
"    print(data)\n", |
|
"\n", |
|
"print('\\nLines under TABLEB:')\n", |
|
"for data in table_b_lines:\n", |
|
"    print(data)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"import re\n", |
|
"from datetime import date, datetime\n", |
|
"import locale\n", |
|
"\n", |
|
"locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')\n", |
|
"\n", |
|
"# Read the statement export as a list of lines.\n", |
|
"with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n", |
|
"    contents = file.readlines()\n", |
|
"\n", |
|
"# Section headers, and the fixed-width row layout (plain and with groups).\n", |
|
"dan_pattern = r'1 - DANIEL.*'\n", |
|
"iza_pattern = r'4 - IZABELY.*'\n", |
|
"line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", |
|
"line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n", |
|
"\n", |
|
"# Rows collected under each section header.\n", |
|
"list_dan = []\n", |
|
"list_iza = []\n", |
|
"current_list = None\n", |
|
"\n", |
|
"insert_bulk = []\n", |
|
"\n", |
|
"# A header line switches the current section; a row line is stored under the\n", |
|
"# current section and ignored when no header has been seen yet.\n", |
|
"for line in contents:\n", |
|
"    line = line.strip()\n", |
|
"    if re.match(dan_pattern, line):\n", |
|
"        current_list = 'list_dan'\n", |
|
"        print('found Dan')\n", |
|
"    elif re.match(iza_pattern, line):\n", |
|
"        current_list = 'list_iza'\n", |
|
"        print('found Iza')\n", |
|
"    elif re.match(line_pattern, line):\n", |
|
"        if current_list == 'list_dan':\n", |
|
"            print(\"dan\", line)\n", |
|
"            list_dan.append(line)\n", |
|
"        elif current_list == 'list_iza':\n", |
|
"            print(\"iza\", line)\n", |
|
"            list_iza.append(line)\n", |
|
"\n", |
|
"# Build one insert tuple per row. Both sections share the same steps and\n", |
|
"# differ only in the owner id (1 for list_dan, 2 for list_iza).\n", |
|
"for label, rows, tOwner in (('list_dan', list_dan, 1), ('list_iza', list_iza, 2)):\n", |
|
"    print(label + ' - tuples for insert')\n", |
|
"    for item in rows:\n", |
|
"        match = re.search(line_group_pattern, item)\n", |
|
"        # Groups 1-3 are day, month, year; date() wants year, month, day.\n", |
|
"        tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n", |
|
"        tAccount = 1\n", |
|
"        tMemo = match.group(4)\n", |
|
"        tCity = match.group(5)\n", |
|
"        tCountry = match.group(6)\n", |
|
"        # Amounts use a decimal comma in the export; store them with a dot.\n", |
|
"        tOutflow = match.group(7).strip().replace(',', '.')\n", |
|
"        tInflow = match.group(8).strip().replace(',', '.')\n", |
|
"        tInstallments = 1\n", |
|
"        tCreated = str(datetime.now(tz=None))\n", |
|
"        tUpdated = None\n", |
|
"        insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"# INSERT statement for the TRANSACTION table: eleven columns, one %s placeholder each.\n", |
|
"insert_query = (\n", |
|
"    \"INSERT INTO default.TRANSACTION \"\n", |
|
"    \"(TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENTS, CREATED, UPDATED) \"\n", |
|
"    \"VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\n", |
|
")" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 45, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"def create_lists(path='OUROCARD_VISA_INFINITE-Ago_24.txt', encoding='latin'):\n", |
|
"    \"\"\"Split the statement's transaction rows by cardholder section.\n", |
|
"\n", |
|
"    The file is read line by line and each line is stripped. A line that\n", |
|
"    starts with ``1 - DANIEL`` or ``4 - IZABELY`` switches the current\n", |
|
"    section. Every later line that has the row layout (``dd.mm.yyyy`` date,\n", |
|
"    fixed-width fields of 23, 14 and 2 characters, two amounts with a decimal\n", |
|
"    comma) is stored under that section. Row lines seen before any section\n", |
|
"    header are ignored.\n", |
|
"\n", |
|
"    Args:\n", |
|
"        path: Text file to parse. Defaults to the file name this notebook\n", |
|
"            has always read.\n", |
|
"        encoding: Encoding used to open the file. Defaults to ``'latin'``.\n", |
|
"\n", |
|
"    Returns:\n", |
|
"        ``[list_dan, list_iza]``: two lists of stripped row strings.\n", |
|
"    \"\"\"\n", |
|
"    import re\n", |
|
"\n", |
|
"    # Compiled once instead of on every line.\n", |
|
"    dan_header = re.compile(r'1 - DANIEL.*')\n", |
|
"    iza_header = re.compile(r'4 - IZABELY.*')\n", |
|
"    # NOTE(review): each amount must be digits, a comma and two digits. A value\n", |
|
"    # written with a thousands separator (1.234,56) or a sign would not match\n", |
|
"    # and its row would be skipped - confirm against the export format.\n", |
|
"    row_layout = re.compile(r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}')\n", |
|
"\n", |
|
"    list_dan = []\n", |
|
"    list_iza = []\n", |
|
"    current = None  # list of the section being read; None before any header\n", |
|
"\n", |
|
"    with open(path, 'r', encoding=encoding) as file:\n", |
|
"        for raw in file:\n", |
|
"            line = raw.strip()\n", |
|
"            if dan_header.match(line):\n", |
|
"                current = list_dan\n", |
|
"            elif iza_header.match(line):\n", |
|
"                current = list_iza\n", |
|
"            elif current is not None and row_layout.match(line):\n", |
|
"                current.append(line)\n", |
|
"\n", |
|
"    return [list_dan, list_iza]" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 49, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"def build_insert(lists: list[list[str]], account: int, owner: 'int | list[int] | tuple[int, ...]'):\n", |
|
"    \"\"\"Turn parsed statement rows into tuples for a bulk INSERT.\n", |
|
"\n", |
|
"    Each row is split with the fixed-width layout: ``dd.mm.yyyy`` date, memo\n", |
|
"    (23 characters), city (14 characters), country (2 characters) and two\n", |
|
"    amounts written with a decimal comma.\n", |
|
"\n", |
|
"    Args:\n", |
|
"        lists: Batches of row strings, e.g. the value returned by\n", |
|
"            ``create_lists()``.\n", |
|
"        account: Account id stored in every tuple.\n", |
|
"        owner: Owner id stored in every tuple. A list or tuple with one id\n", |
|
"            per batch is also accepted, so each batch can get its own owner.\n", |
|
"\n", |
|
"    Returns:\n", |
|
"        A list of 11-tuples ``(date, account, memo, city, country, outflow,\n", |
|
"        inflow, owner, installments, created, updated)``. The date is an ISO\n", |
|
"        string, the amounts are strings with ``,`` replaced by ``.``,\n", |
|
"        installments is always 1, created is ``str(datetime.now())`` and\n", |
|
"        updated is ``None``.\n", |
|
"\n", |
|
"    Raises:\n", |
|
"        ValueError: If a row does not have the expected layout, or if a\n", |
|
"            sequence of owners does not have one entry per batch.\n", |
|
"    \"\"\"\n", |
|
"    # Imported here like the other helpers, so the function does not depend\n", |
|
"    # on an earlier cell having imported re.\n", |
|
"    import re\n", |
|
"    from datetime import date, datetime\n", |
|
"\n", |
|
"    line_group_pattern = re.compile(\n", |
|
"        r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n", |
|
"    )\n", |
|
"\n", |
|
"    # One owner id per batch; a single int is reused for every batch.\n", |
|
"    if isinstance(owner, (list, tuple)):\n", |
|
"        if len(owner) != len(lists):\n", |
|
"            raise ValueError('owner must have one entry per batch')\n", |
|
"        owners = list(owner)\n", |
|
"    else:\n", |
|
"        owners = [owner] * len(lists)\n", |
|
"\n", |
|
"    insert_bulk = []\n", |
|
"    for batch, tOwner in zip(lists, owners):\n", |
|
"        for item in batch:\n", |
|
"            match = line_group_pattern.search(item)\n", |
|
"            if match is None:\n", |
|
"                raise ValueError(f'not a transaction row: {item!r}')\n", |
|
"            # Groups 1-3 are day, month, year; date() wants year, month, day.\n", |
|
"            tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n", |
|
"            tMemo = match.group(4)\n", |
|
"            tCity = match.group(5)\n", |
|
"            tCountry = match.group(6)\n", |
|
"            # Amounts use a decimal comma in the export; store them with a dot.\n", |
|
"            tOutflow = match.group(7).strip().replace(',', '.')\n", |
|
"            tInflow = match.group(8).strip().replace(',', '.')\n", |
|
"            tInstallments = 1\n", |
|
"            tCreated = str(datetime.now(tz=None))\n", |
|
"            tUpdated = None\n", |
|
"            insert_bulk.append(( tTdate, account, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n", |
|
"\n", |
|
"    return insert_bulk" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 47, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"def db_insert(insert_bulk: list[tuple]):\n", |
|
"    \"\"\"Insert the given rows with the notebook-level ``insert_query``.\n", |
|
"\n", |
|
"    Connects to the ``default`` database on localhost as ``root``, runs\n", |
|
"    ``insert_query`` once per tuple with ``executemany`` and commits. A\n", |
|
"    connector error is printed, not raised.\n", |
|
"\n", |
|
"    The password is read from the ``MYSQL_PASSWORD`` environment variable;\n", |
|
"    when that is unset or empty it is asked for interactively, so no secret\n", |
|
"    is stored in the notebook.\n", |
|
"\n", |
|
"    Args:\n", |
|
"        insert_bulk: Parameter tuples for ``insert_query``, e.g. the value\n", |
|
"            returned by ``build_insert()``.\n", |
|
"    \"\"\"\n", |
|
"    import os\n", |
|
"    from getpass import getpass\n", |
|
"    from mysql.connector import connect, Error\n", |
|
"\n", |
|
"    password = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password for root: ')\n", |
|
"\n", |
|
"    try:\n", |
|
"        # The with-statement closes the connection. A separate finally that\n", |
|
"        # calls connection.close() would fail with an unbound name whenever\n", |
|
"        # connect() itself raised, hiding the real error.\n", |
|
"        with connect(\n", |
|
"            host='localhost',\n", |
|
"            user='root',\n", |
|
"            password=password,\n", |
|
"            database='default'\n", |
|
"        ) as connection:\n", |
|
"            print(\"CONNECTED!\", connection)\n", |
|
"            with connection.cursor() as cursor:\n", |
|
"                cursor.executemany(insert_query, insert_bulk)\n", |
|
"            connection.commit()\n", |
|
"            print(\"DONE!\")\n", |
|
"    except Error as e:\n", |
|
"        print(e)\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 48, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"CONNECTED! <mysql.connector.connection_cext.CMySQLConnection object at 0x79c990b31430>\n", |
|
"DONE!\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"# Parse the statement, build the rows for account 1 / owner 1, and insert them.\n", |
|
"db_insert(build_insert(create_lists(), 1, 1))" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 57, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"NEW EMPAR EMPREENDIMENT\n", |
|
"NAZO SUSHI BAR \n", |
|
"IFD*IFOOD.COM AGENCIA D\n", |
|
"IFD*RC MELO COMERCIO D \n", |
|
"GRUPO FARTURA DE HORTI \n", |
|
"IFD*JL COMERCIO VAREJIS\n", |
|
"MURAKAMI \n", |
|
"ZP *CANTINAGOODLANCHEV \n", |
|
"CANTINA E CIA \n", |
|
"CANTINA E CIA \n", |
|
"IFD*NFE COMERCIO DE ALI\n", |
|
"BENITA PANINOTECA \n", |
|
"IFD*BSQUARE PIZZA BURGE\n", |
|
"IFD*RC MELO COMERCIO DE\n", |
|
"CANTINA E CIA \n", |
|
"ZP *CANTINAGOODLANCHEV \n", |
|
"IFD*BSQUARE PIZZA BURGE\n", |
|
"DROGASIL 2067 \n", |
|
"REDE BRASIL DRUGSTORE \n", |
|
"PAGUE MENOS 1225 \n", |
|
"PAG*EduardoMeireles \n", |
|
"MR JOHN BARBEARIA LTDA \n", |
|
"NETFLIX.COM \n", |
|
"PAG*FolhaDeSPaulo \n", |
|
"IFD*CR EXPRESS \n", |
|
"APPLE.COM/BILL \n", |
|
"MERCADOLIVRE*MERCADOLIV\n", |
|
"MP*5PRODUTOS \n", |
|
"MERCADOLIVRE*SABORESDAM\n", |
|
"MP*MELIMAIS \n", |
|
"Wellhub Gympass BR Gymp\n", |
|
"IFD*CR EXPRESS \n", |
|
"MG LAVA JATO \n", |
|
"MERCADOLIVRE*GLDECOR \n", |
|
"MR JOHN BARBEARIA LTDA \n", |
|
"APPLE.COM/BILL \n", |
|
"IFD*SCORPIONS EXPRESS \n", |
|
"CASCOL COMBUSTIVEIS \n", |
|
"IFD*GRAN LOG EXPRESS \n", |
|
"SUPERAUTOR C*Supe \n", |
|
"UBER* TRIP \n", |
|
"UBER *TRIP HELP.UBER.CO\n", |
|
"UBER * PENDING \n", |
|
"UBER *TRIP HELP.UBER.CO\n", |
|
"VELOE \n", |
|
"POUSADA PIRENEUS RESOR \n", |
|
"DL*GOOGLE YouTub \n", |
|
"STEAMGAMES.COM 42595229\n", |
|
"IOF - COMPRA NO EXTERIO\n", |
|
"STEAM PURCHASE \n", |
|
"IOF - COMPRA NO EXTERIO\n", |
|
"PAG*XsollaGames \n", |
|
"PRODUTOS GLOB PARC 04/1\n", |
|
"MP*MUNDODOSCO PARC 07/1\n", |
|
"PAG*Folhadesp PARC 03/0\n", |
|
"BIANCHINI AUT PARC 10/1\n", |
|
"PARC=112 BRAS PARC 03/1\n", |
|
"SALTO CORUMBA \n", |
|
"PG *TON MINHACANTINA \n", |
|
"PASTELARIA VICOSA IV \n", |
|
"PASTELARIA VICOSA IV \n", |
|
"CANTINA E CIA \n", |
|
"MERCADOLIVRE*3PRODUTOS \n", |
|
"COFFEE BIKE CAFES ESPEC\n", |
|
"IFD*ARCOS DOURADOS COME\n", |
|
"RESTAURANTE FAROFINA \n", |
|
"PAG*DiogoLealPimenta \n", |
|
"TREVISO GALETERIA E P \n", |
|
"OLINDA COMIDA NORDESTI \n", |
|
"DULCE PATAGONIA \n", |
|
"HOT DOG CLUB \n", |
|
"RESTAURANTE SAO JOAO \n", |
|
"GELATO E GRANO \n", |
|
"F L L MELO LTDA \n", |
|
"CREMERIA ITALIANA \n", |
|
"RITUARIA*Rituaria \n", |
|
"BIOEXATA FARMACIA \n", |
|
"CASCOL COMBUSTIVEIS \n", |
|
"LIMBER SOFTWARE E CONS \n", |
|
"PAG*EduardoMeireles \n", |
|
"PAG*EduardoMeireles \n", |
|
"PAG*CidaRommanel \n", |
|
"ALLPARK EMPREENDIMENTOS\n", |
|
"MERCADOLIVRE*CHINALINK \n", |
|
"PG *S S MENDES COMERCI \n", |
|
"MERCADOLIVRE*3PRODUTOS \n", |
|
"WOW*SALE COMERCIO E SE \n", |
|
"PARENTELA PANIFICADORA \n", |
|
"FranciscoDeAssis \n", |
|
"BONNAPAN SEU DIA MAIS \n", |
|
"MP*BRILHODASARTE \n", |
|
"CARREFOUR PL2 338 \n", |
|
"UBER * PENDING \n", |
|
"UBER * PENDING \n", |
|
"UBER* TRIP \n", |
|
"UBER* TRIP \n", |
|
"UBER * PENDING \n", |
|
"UBER* TRIP \n", |
|
"UBER* TRIP \n", |
|
"UBER* TRIP \n", |
|
"UBER* TRIP \n", |
|
"UBER* TRIP \n", |
|
"UBER * PENDING \n", |
|
"CIDA REIS MODA FITNESS \n", |
|
"LANCHONETE SERRA RODO \n", |
|
"HOTEL GOYA P*hote \n", |
|
"BRASILIA EMPR PARC 07/1\n", |
|
"PG *B4A GLAMB PARC 08/1\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"# Rebuild the rows and print element 2 of each tuple (tMemo in build_insert).\n", |
|
"tLists = build_insert(create_lists(), 1, 1)\n", |
|
"for item in tLists:\n", |
|
"    print(item[2])" |
|
] |
|
} |
|
], |
|
"metadata": { |
|
"kernelspec": { |
|
"display_name": "Python 3", |
|
"language": "python", |
|
"name": "python3" |
|
}, |
|
"language_info": { |
|
"codemirror_mode": { |
|
"name": "ipython", |
|
"version": 3 |
|
}, |
|
"file_extension": ".py", |
|
"mimetype": "text/x-python", |
|
"name": "python", |
|
"nbconvert_exporter": "python", |
|
"pygments_lexer": "ipython3", |
|
"version": "3.12.3" |
|
} |
|
}, |
|
"nbformat": 4, |
|
"nbformat_minor": 2 |
|
}
|
|
|