{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_file = 'OUROCARD_VISA_INFINITE-Ago_24.txt'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pypdf import PdfReader\n", "\n", "reader = PdfReader('cc.pdf')\n", "number_of_pages = len(reader.pages)\n", "output = ''\n", "for i in range(number_of_pages):\n", " page = reader.pages[i]\n", " output += page.extract_text()\n", "print(output)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as reader:\n", " data = reader.read()\n", " print(data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "# Open the text file\n", "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", " # Read the contents of the file\n", " contents = file.read()\n", "\n", "# Define the regex pattern to match\n", "pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", "\n", "# Iterate over the lines that match the pattern\n", "for match in re.finditer(pattern, contents):\n", " print(match.group())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "# Open the text file\n", "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", " # Read the contents of the file\n", " contents = file.read()\n", "\n", "# Define the regex pattern to match\n", "pattern = r'.*DANIEL.*|.*IZABELY.*'\n", "\n", "# Iterate over the lines that match the pattern\n", "for match in re.finditer(pattern, contents):\n", " print(match.group())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "# Open the text file\n", "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n", " # Read the contents of the file\n", " contents = file.read()\n", "\n", "# Define the regex patterns\n", "dan_pattern = r'*DANIEL.*'\n", "iza_pattern = r'.*IZABELY.*'\n", "line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", "\n", "# Iterate over the lines that match the pattern\n", "for match in re.finditer(line_pattern, contents):\n", " print(match.group())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Open the text file\n", "with open('table-test.txt', 'r') as file:\n", " # Read the contents of the file\n", " contents = file.readlines()\n", "\n", "# Initialize lists to store the lines under each table\n", "table_a_lines = []\n", "table_b_lines = []\n", "\n", "# Flag to determine which table section we are in\n", "current_table = None\n", "\n", "# Iterate over the lines in the file\n", "for line in contents:\n", " line = line.strip() # Remove leading and trailing whitespace\n", "\n", " # Check for TABLEA and TABLEB\n", " if line == 'TABLEA':\n", " current_table = 'TABLEA'\n", " elif line == 'TABLEB':\n", " current_table = 'TABLEB'\n", " else:\n", " # Add lines to the appropriate list based on the current table\n", " if current_table == 'TABLEA':\n", " table_a_lines.append(line)\n", " elif current_table == 'TABLEB':\n", " table_b_lines.append(line)\n", "\n", "# Print the results\n", "print('Lines under TABLEA:')\n", "for data in table_a_lines:\n", " print(data)\n", "\n", "print('\\nLines under TABLEB:')\n", "for data in table_b_lines:\n", " print(data)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "from datetime import date, datetime\n", "import locale\n", "\n", "locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')\n", "\n", "# Open the text file\n", "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n", " # Read the contents of the file\n", " contents = file.readlines()\n", "\n", "# Define the regex patterns\n", "dan_pattern = r'1 - DANIEL.*'\n", "iza_pattern = r'4 - IZABELY.*'\n", "line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", "line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n", "\n", "# Lists\n", "list_dan = []\n", "list_iza = []\n", "current_list = None\n", "\n", "insert_bulk = []\n", "\n", "# Iterate all lines\n", "for line in contents:\n", " line = line.strip()\n", " if re.match(dan_pattern, line):\n", " current_list = 'list_dan'\n", " print('found Dan')\n", " elif re.match(iza_pattern, line):\n", " current_list = 'list_iza'\n", " print('found Iza')\n", " else:\n", " if re.match(line_pattern, line):\n", " if current_list == 'list_dan':\n", " print(\"dan\", line)\n", " list_dan.append(line)\n", " if current_list == 'list_iza':\n", " print(\"iza\", line)\n", " list_iza.append(line)\n", "\n", "print('list_dan - tuples for insert')\n", "for item in list_dan:\n", " match = re.search(line_group_pattern, item)\n", " tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n", " tAccount = 1\n", " tMemo = match.group(4)\n", " tCity = match.group(5)\n", " tCountry = match.group(6)\n", " tOutflow = match.group(7).strip().replace(',', '.')\n", " tInflow = match.group(8).strip().replace(',', '.')\n", " tOwner = 1\n", " tInstallments = 1\n", " tCreated = str(datetime.now(tz=None))\n", " tUpdated = None\n", " insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n", "\n", "print('list_dan - tuples for insert')\n", "for item in list_iza:\n", " match = re.search(line_group_pattern, item)\n", " tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n", " tAccount = 1\n", " tMemo = match.group(4)\n", " tCity = match.group(5)\n", " tCountry = match.group(6)\n", " tOutflow = match.group(7).strip().replace(',', '.')\n", " tInflow = match.group(8).strip().replace(',', '.')\n", " tOwner = 2\n", " tInstallments = 1\n", " tCreated = str(datetime.now(tz=None))\n", " tUpdated = None\n", " insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "insert_query =\"INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENTS, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "def create_lists():\n", " import re\n", "\n", " # Open the text file\n", " with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n", " # Read the contents of the file\n", " contents = file.readlines()\n", "\n", " # Define the regex patterns\n", " dan_pattern = r'1 - DANIEL.*'\n", " iza_pattern = r'4 - IZABELY.*'\n", " line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n", "\n", " # Lists\n", " list_dan = 
[]\n", " list_iza = []\n", " current_list = None\n", "\n", " # Iterate all lines\n", " for line in contents:\n", " line = line.strip()\n", " if re.match(dan_pattern, line):\n", " current_list = 'list_dan'\n", " elif re.match(iza_pattern, line):\n", " current_list = 'list_iza'\n", " else:\n", " if re.match(line_pattern, line):\n", " if current_list == 'list_dan':\n", " list_dan.append(line)\n", " if current_list == 'list_iza':\n", " list_iza.append(line)\n", " \n", " return [list_dan, list_iza]" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "def build_insert(lists: list[list, list], account: int, owner: int):\n", " from datetime import date, datetime\n", "\n", " insert_bulk = []\n", " line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n", "\n", " for batch in lists:\n", " for item in batch:\n", " match = re.search(line_group_pattern, item)\n", " tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n", " tAccount = account\n", " tMemo = match.group(4)\n", " tCity = match.group(5)\n", " tCountry = match.group(6)\n", " tOutflow = match.group(7).strip().replace(',', '.')\n", " tInflow = match.group(8).strip().replace(',', '.')\n", " tOwner = owner\n", " tInstallments = 1\n", " tCreated = str(datetime.now(tz=None))\n", " tUpdated = None\n", " insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n", " \n", " return insert_bulk" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "def db_insert(insert_bulk: list[tuple]):\n", " from mysql.connector import connect, Error\n", "\n", " try:\n", " with connect(\n", " host='localhost',\n", " user='root',\n", " password='pleasehashapasswordomg',\n", " database='default'\n", " ) as connection:\n", " print(\"CONNECTED!\", connection)\n", " with connection.cursor() as cursor:\n", " cursor.executemany(insert_query, insert_bulk)\n", " connection.commit()\n", " print(\"DONE!\")\n", " except Error as e:\n", " print(e)\n", " finally:\n", " connection.close()\n" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CONNECTED! 
\n", "DONE!\n" ] } ], "source": [ "db_insert(\n", " build_insert(\n", " create_lists(), 1, 1\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NEW EMPAR EMPREENDIMENT\n", "NAZO SUSHI BAR \n", "IFD*IFOOD.COM AGENCIA D\n", "IFD*RC MELO COMERCIO D \n", "GRUPO FARTURA DE HORTI \n", "IFD*JL COMERCIO VAREJIS\n", "MURAKAMI \n", "ZP *CANTINAGOODLANCHEV \n", "CANTINA E CIA \n", "CANTINA E CIA \n", "IFD*NFE COMERCIO DE ALI\n", "BENITA PANINOTECA \n", "IFD*BSQUARE PIZZA BURGE\n", "IFD*RC MELO COMERCIO DE\n", "CANTINA E CIA \n", "ZP *CANTINAGOODLANCHEV \n", "IFD*BSQUARE PIZZA BURGE\n", "DROGASIL 2067 \n", "REDE BRASIL DRUGSTORE \n", "PAGUE MENOS 1225 \n", "PAG*EduardoMeireles \n", "MR JOHN BARBEARIA LTDA \n", "NETFLIX.COM \n", "PAG*FolhaDeSPaulo \n", "IFD*CR EXPRESS \n", "APPLE.COM/BILL \n", "MERCADOLIVRE*MERCADOLIV\n", "MP*5PRODUTOS \n", "MERCADOLIVRE*SABORESDAM\n", "MP*MELIMAIS \n", "Wellhub Gympass BR Gymp\n", "IFD*CR EXPRESS \n", "MG LAVA JATO \n", "MERCADOLIVRE*GLDECOR \n", "MR JOHN BARBEARIA LTDA \n", "APPLE.COM/BILL \n", "IFD*SCORPIONS EXPRESS \n", "CASCOL COMBUSTIVEIS \n", "IFD*GRAN LOG EXPRESS \n", "SUPERAUTOR C*Supe \n", "UBER* TRIP \n", "UBER *TRIP HELP.UBER.CO\n", "UBER * PENDING \n", "UBER *TRIP HELP.UBER.CO\n", "VELOE \n", "POUSADA PIRENEUS RESOR \n", "DL*GOOGLE YouTub \n", "STEAMGAMES.COM 42595229\n", "IOF - COMPRA NO EXTERIO\n", "STEAM PURCHASE \n", "IOF - COMPRA NO EXTERIO\n", "PAG*XsollaGames \n", "PRODUTOS GLOB PARC 04/1\n", "MP*MUNDODOSCO PARC 07/1\n", "PAG*Folhadesp PARC 03/0\n", "BIANCHINI AUT PARC 10/1\n", "PARC=112 BRAS PARC 03/1\n", "SALTO CORUMBA \n", "PG *TON MINHACANTINA \n", "PASTELARIA VICOSA IV \n", "PASTELARIA VICOSA IV \n", "CANTINA E CIA \n", "MERCADOLIVRE*3PRODUTOS \n", "COFFEE BIKE CAFES ESPEC\n", "IFD*ARCOS DOURADOS COME\n", "RESTAURANTE FAROFINA \n", "PAG*DiogoLealPimenta \n", "TREVISO GALETERIA E P \n", "OLINDA COMIDA NORDESTI \n", "DULCE PATAGONIA \n", "HOT DOG CLUB \n", "RESTAURANTE SAO JOAO \n", "GELATO E GRANO \n", "F L L MELO LTDA \n", "CREMERIA ITALIANA \n", "RITUARIA*Rituaria \n", "BIOEXATA FARMACIA \n", "CASCOL COMBUSTIVEIS \n", "LIMBER SOFTWARE E CONS \n", "PAG*EduardoMeireles \n", "PAG*EduardoMeireles \n", "PAG*CidaRommanel \n", "ALLPARK EMPREENDIMENTOS\n", "MERCADOLIVRE*CHINALINK \n", "PG *S S MENDES COMERCI \n", "MERCADOLIVRE*3PRODUTOS \n", "WOW*SALE COMERCIO E SE \n", "PARENTELA PANIFICADORA \n", "FranciscoDeAssis \n", "BONNAPAN SEU DIA MAIS \n", "MP*BRILHODASARTE \n", "CARREFOUR PL2 338 \n", "UBER * PENDING \n", "UBER * PENDING \n", "UBER* TRIP \n", "UBER* TRIP \n", "UBER * PENDING \n", "UBER* TRIP \n", "UBER* TRIP \n", "UBER* TRIP \n", "UBER* TRIP \n", "UBER* TRIP \n", "UBER * PENDING \n", "CIDA REIS MODA FITNESS \n", "LANCHONETE SERRA RODO \n", "HOTEL GOYA P*hote \n", "BRASILIA EMPR PARC 07/1\n", "PG *B4A GLAMB PARC 08/1\n" ] } ], "source": [ "tLists = build_insert(create_lists(), 1, 1)\n", "for item in tLists:\n", " print(item[2])\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 2 }