robopato/pdfScrape.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the text file that the parsing cells in this notebook open.\n",
    "# NOTE(review): the cells below repeat this file name as a literal instead of reading test_file.\n",
    "test_file = 'OUROCARD_VISA_INFINITE-Ago_24.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pypdf import PdfReader\n",
    "\n",
    "# Concatenate the extracted text of every page of cc.pdf and display it.\n",
    "reader = PdfReader('cc.pdf')\n",
    "number_of_pages = len(reader.pages)\n",
    "output = ''.join(page.extract_text() for page in reader.pages)\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump the raw text file so its layout can be inspected.\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as reader:\n",
    "    data = reader.read()\n",
    "\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Load the whole text file into a single string\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n",
    "    contents = file.read()\n",
    "\n",
    "# dd.mm.yyyy date, then 23 + 14 + 2 arbitrary characters, then two amounts written with a decimal comma\n",
    "pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
    "\n",
    "# Print each piece of text the pattern matches\n",
    "for found in re.findall(pattern, contents):\n",
    "    print(found)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Read the file in one piece\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n",
    "    contents = file.read()\n",
    "\n",
    "# Any text run mentioning DANIEL or IZABELY ('.' never crosses a newline, so each hit stays within one line)\n",
    "pattern = r'.*DANIEL.*|.*IZABELY.*'\n",
    "\n",
    "# Show every hit in file order\n",
    "name_regex = re.compile(pattern)\n",
    "for hit in name_regex.finditer(contents):\n",
    "    print(hit.group(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Read the whole file into one string\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r') as file:\n",
    "    contents = file.read()\n",
    "\n",
    "# Patterns for lines containing each name. A leading bare '*' is a regex error\n",
    "# ('nothing to repeat'), so both name patterns start with '.*'.\n",
    "dan_pattern = r'.*DANIEL.*'\n",
    "iza_pattern = r'.*IZABELY.*'\n",
    "# dd.mm.yyyy date, then 23 + 14 + 2 arbitrary characters, then two amounts written with a decimal comma\n",
    "line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
    "\n",
    "# Only line_pattern is applied in this cell; the name patterns are defined but not used yet\n",
    "for match in re.finditer(line_pattern, contents):\n",
    "    print(match.group())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every line of the sample file\n",
    "with open('table-test.txt', 'r') as file:\n",
    "    contents = file.readlines()\n",
    "\n",
    "# One bucket per table marker\n",
    "table_a_lines = []\n",
    "table_b_lines = []\n",
    "buckets = {'TABLEA': table_a_lines, 'TABLEB': table_b_lines}\n",
    "\n",
    "# Marker of the section currently being read (None until the first marker appears)\n",
    "current_table = None\n",
    "\n",
    "for line in contents:\n",
    "    line = line.strip()  # markers are compared without surrounding whitespace\n",
    "\n",
    "    # A marker line switches the active section and is not stored itself\n",
    "    if line in buckets:\n",
    "        current_table = line\n",
    "        continue\n",
    "\n",
    "    # Lines seen before any marker are dropped\n",
    "    if current_table is not None:\n",
    "        buckets[current_table].append(line)\n",
    "\n",
    "# Show what ended up in each bucket\n",
    "print('Lines under TABLEA:')\n",
    "for data in table_a_lines:\n",
    "    print(data)\n",
    "\n",
    "print('\\nLines under TABLEB:')\n",
    "for data in table_b_lines:\n",
    "    print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from datetime import date, datetime\n",
    "import locale\n",
    "\n",
    "locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')\n",
    "\n",
    "# Read the file line by line, decoding it as Latin-1\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n",
    "    contents = file.readlines()\n",
    "\n",
    "# Header lines that switch the active section\n",
    "dan_pattern = r'1 - DANIEL.*'\n",
    "iza_pattern = r'4 - IZABELY.*'\n",
    "# Transaction row: plain form for filtering, grouped form for field extraction\n",
    "line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
    "line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n",
    "\n",
    "# Rows collected per section\n",
    "list_dan = []\n",
    "list_iza = []\n",
    "current_list = None\n",
    "\n",
    "insert_bulk = []\n",
    "\n",
    "# Route every transaction row to the section announced by the latest header\n",
    "for line in contents:\n",
    "    line = line.strip()\n",
    "    if re.match(dan_pattern, line):\n",
    "        current_list = 'list_dan'\n",
    "        print('found Dan')\n",
    "    elif re.match(iza_pattern, line):\n",
    "        current_list = 'list_iza'\n",
    "        print('found Iza')\n",
    "    elif re.match(line_pattern, line):\n",
    "        if current_list == 'list_dan':\n",
    "            print(\"dan\", line)\n",
    "            list_dan.append(line)\n",
    "        elif current_list == 'list_iza':\n",
    "            print(\"iza\", line)\n",
    "            list_iza.append(line)\n",
    "\n",
    "# Build the insert tuples with one shared loop: owner 1 for list_dan, owner 2 for list_iza.\n",
    "# Each header now names the list actually being processed.\n",
    "for label, rows, tOwner in (('list_dan', list_dan, 1), ('list_iza', list_iza, 2)):\n",
    "    print(label + ' - tuples for insert')\n",
    "    for item in rows:\n",
    "        # Every stored row already matched line_pattern, so the grouped pattern matches too\n",
    "        match = re.search(line_group_pattern, item)\n",
    "        # Groups 1-3 are day, month, year; date() wants year, month, day\n",
    "        tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n",
    "        tAccount = 1\n",
    "        tMemo = match.group(4)\n",
    "        tCity = match.group(5)\n",
    "        tCountry = match.group(6)\n",
    "        # Amounts use a decimal comma in the file; switch to a decimal point\n",
    "        tOutflow = match.group(7).strip().replace(',', '.')\n",
    "        tInflow = match.group(8).strip().replace(',', '.')\n",
    "        tInstallments = 1\n",
    "        tCreated = str(datetime.now(tz=None))\n",
    "        tUpdated = None\n",
    "        insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameterised INSERT with one %s placeholder per listed column (11 in total)\n",
    "insert_query = (\n",
    "    \"INSERT INTO default.TRANSACTION \"\n",
    "    \"(TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENTS, CREATED, UPDATED) \"\n",
    "    \"VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_lists(path='OUROCARD_VISA_INFINITE-Ago_24.txt'):\n",
    "    \"\"\"Collect the transaction rows of each section of the text file.\n",
    "\n",
    "    The file is decoded as Latin-1 and scanned line by line. A line matching\n",
    "    '1 - DANIEL...' or '4 - IZABELY...' switches the active section; every\n",
    "    later line matching the transaction-row pattern is appended to that\n",
    "    section's list. Rows that appear before any section header are ignored.\n",
    "\n",
    "    Args:\n",
    "        path: Text file to parse. Defaults to the file name that used to be\n",
    "            hard-coded here, so existing calls without arguments are unchanged.\n",
    "\n",
    "    Returns:\n",
    "        [list_dan, list_iza]: the whitespace-stripped row strings of each section.\n",
    "    \"\"\"\n",
    "    import re\n",
    "\n",
    "    with open(path, 'r', encoding='latin') as file:\n",
    "        contents = file.readlines()\n",
    "\n",
    "    # Section headers\n",
    "    dan_pattern = r'1 - DANIEL.*'\n",
    "    iza_pattern = r'4 - IZABELY.*'\n",
    "    # dd.mm.yyyy date, then 23 + 14 + 2 arbitrary characters, then two amounts with a decimal comma\n",
    "    line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
    "\n",
    "    list_dan = []\n",
    "    list_iza = []\n",
    "    current_list = None  # no section is active until the first header is seen\n",
    "\n",
    "    for line in contents:\n",
    "        line = line.strip()\n",
    "        if re.match(dan_pattern, line):\n",
    "            current_list = 'list_dan'\n",
    "        elif re.match(iza_pattern, line):\n",
    "            current_list = 'list_iza'\n",
    "        elif re.match(line_pattern, line):\n",
    "            if current_list == 'list_dan':\n",
    "                list_dan.append(line)\n",
    "            elif current_list == 'list_iza':\n",
    "                list_iza.append(line)\n",
    "\n",
    "    return [list_dan, list_iza]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_insert(lists: list[list[str]], account: int, owner: int):\n",
    "    \"\"\"Turn parsed row strings into parameter tuples for insert_query.\n",
    "\n",
    "    Args:\n",
    "        lists: Batches of row strings, e.g. the value returned by create_lists().\n",
    "        account: Value placed in the ACCOUNTID position of every tuple.\n",
    "        owner: Value placed in the OWNERID position of every tuple; the same\n",
    "            owner is applied to all batches.\n",
    "\n",
    "    Returns:\n",
    "        A list of 11-tuples in insert_query column order: date as 'YYYY-MM-DD',\n",
    "        account, memo, city, country, outflow, inflow, owner, installments (1),\n",
    "        creation timestamp string, and None for the update timestamp.\n",
    "    \"\"\"\n",
    "    # Imported locally so the function does not rely on another cell having run 'import re'\n",
    "    import re\n",
    "    from datetime import date, datetime\n",
    "\n",
    "    insert_bulk = []\n",
    "    # Groups: day, month, year, 23-char field, 14-char field, 2-char field, two amounts\n",
    "    line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n",
    "\n",
    "    for batch in lists:\n",
    "        for item in batch:\n",
    "            match = re.search(line_group_pattern, item)\n",
    "            # date() wants year, month, day while the row stores day.month.year\n",
    "            tTdate = str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))\n",
    "            tAccount = account\n",
    "            tMemo = match.group(4)\n",
    "            tCity = match.group(5)\n",
    "            tCountry = match.group(6)\n",
    "            # Amounts use a decimal comma in the source text; switch to a decimal point\n",
    "            tOutflow = match.group(7).strip().replace(',', '.')\n",
    "            tInflow = match.group(8).strip().replace(',', '.')\n",
    "            tOwner = owner\n",
    "            tInstallments = 1\n",
    "            tCreated = str(datetime.now(tz=None))\n",
    "            tUpdated = None\n",
    "            insert_bulk.append(( tTdate, tAccount, tMemo, tCity, tCountry, tOutflow, tInflow, tOwner, tInstallments, tCreated, tUpdated ))\n",
    "\n",
    "    return insert_bulk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def db_insert(insert_bulk: list[tuple]):\n",
    "    \"\"\"Run insert_query once per tuple in insert_bulk and commit.\n",
    "\n",
    "    mysql.connector errors (connection or query) are printed, not raised.\n",
    "    The password is read from the MYSQL_PASSWORD environment variable when set.\n",
    "\n",
    "    Args:\n",
    "        insert_bulk: Parameter tuples, one per row, passed to cursor.executemany()\n",
    "            together with the notebook-level insert_query.\n",
    "    \"\"\"\n",
    "    import os\n",
    "    from mysql.connector import connect, Error\n",
    "\n",
    "    try:\n",
    "        # The with-block owns the connection, so there is no explicit close():\n",
    "        # a finally-close hits an unbound name when connect() itself raises.\n",
    "        with connect(\n",
    "            host='localhost',\n",
    "            user='root',\n",
    "            # NOTE(review): the literal fallback keeps existing runs working; drop it once MYSQL_PASSWORD is set\n",
    "            password=os.environ.get('MYSQL_PASSWORD', 'pleasehashapasswordomg'),\n",
    "            database='default'\n",
    "        ) as connection:\n",
    "            print(\"CONNECTED!\", connection)\n",
    "            with connection.cursor() as cursor:\n",
    "                cursor.executemany(insert_query, insert_bulk)\n",
    "            connection.commit()\n",
    "            print(\"DONE!\")\n",
    "    except Error as e:\n",
    "        print(e)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CONNECTED! <mysql.connector.connection_cext.CMySQLConnection object at 0x79c990b31430>\n",
      "DONE!\n"
     ]
    }
   ],
   "source": [
    "# Parse the file, build the insert tuples (account 1, owner 1) and write them to the database.\n",
    "# NOTE(review): owner 1 is applied to both lists here, while the exploratory cell used owner 2 for list_iza - confirm.\n",
    "parsed_lists = create_lists()\n",
    "rows_to_insert = build_insert(parsed_lists, 1, 1)\n",
    "db_insert(rows_to_insert)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NEW EMPAR EMPREENDIMENT\n",
      "NAZO SUSHI BAR         \n",
      "IFD*IFOOD.COM AGENCIA D\n",
      "IFD*RC MELO COMERCIO D \n",
      "GRUPO FARTURA DE HORTI \n",
      "IFD*JL COMERCIO VAREJIS\n",
      "MURAKAMI               \n",
      "ZP *CANTINAGOODLANCHEV \n",
      "CANTINA E CIA          \n",
      "CANTINA E CIA          \n",
      "IFD*NFE COMERCIO DE ALI\n",
      "BENITA PANINOTECA      \n",
      "IFD*BSQUARE PIZZA BURGE\n",
      "IFD*RC MELO COMERCIO DE\n",
      "CANTINA E CIA          \n",
      "ZP *CANTINAGOODLANCHEV \n",
      "IFD*BSQUARE PIZZA BURGE\n",
      "DROGASIL 2067          \n",
      "REDE BRASIL DRUGSTORE  \n",
      "PAGUE MENOS 1225       \n",
      "PAG*EduardoMeireles    \n",
      "MR JOHN BARBEARIA LTDA \n",
      "NETFLIX.COM            \n",
      "PAG*FolhaDeSPaulo      \n",
      "IFD*CR EXPRESS         \n",
      "APPLE.COM/BILL         \n",
      "MERCADOLIVRE*MERCADOLIV\n",
      "MP*5PRODUTOS           \n",
      "MERCADOLIVRE*SABORESDAM\n",
      "MP*MELIMAIS            \n",
      "Wellhub Gympass BR Gymp\n",
      "IFD*CR EXPRESS         \n",
      "MG LAVA JATO           \n",
      "MERCADOLIVRE*GLDECOR   \n",
      "MR JOHN BARBEARIA LTDA \n",
      "APPLE.COM/BILL         \n",
      "IFD*SCORPIONS EXPRESS  \n",
      "CASCOL COMBUSTIVEIS    \n",
      "IFD*GRAN LOG EXPRESS   \n",
      "SUPERAUTOR C*Supe      \n",
      "UBER* TRIP             \n",
      "UBER *TRIP HELP.UBER.CO\n",
      "UBER * PENDING         \n",
      "UBER *TRIP HELP.UBER.CO\n",
      "VELOE                  \n",
      "POUSADA PIRENEUS RESOR \n",
      "DL*GOOGLE YouTub       \n",
      "STEAMGAMES.COM 42595229\n",
      "IOF - COMPRA NO EXTERIO\n",
      "STEAM PURCHASE         \n",
      "IOF - COMPRA NO EXTERIO\n",
      "PAG*XsollaGames        \n",
      "PRODUTOS GLOB PARC 04/1\n",
      "MP*MUNDODOSCO PARC 07/1\n",
      "PAG*Folhadesp PARC 03/0\n",
      "BIANCHINI AUT PARC 10/1\n",
      "PARC=112 BRAS PARC 03/1\n",
      "SALTO CORUMBA          \n",
      "PG *TON MINHACANTINA   \n",
      "PASTELARIA VICOSA IV   \n",
      "PASTELARIA VICOSA IV   \n",
      "CANTINA E CIA          \n",
      "MERCADOLIVRE*3PRODUTOS \n",
      "COFFEE BIKE CAFES ESPEC\n",
      "IFD*ARCOS DOURADOS COME\n",
      "RESTAURANTE FAROFINA   \n",
      "PAG*DiogoLealPimenta   \n",
      "TREVISO GALETERIA E P  \n",
      "OLINDA COMIDA NORDESTI \n",
      "DULCE PATAGONIA        \n",
      "HOT DOG CLUB           \n",
      "RESTAURANTE SAO JOAO   \n",
      "GELATO E GRANO         \n",
      "F L L MELO LTDA        \n",
      "CREMERIA ITALIANA      \n",
      "RITUARIA*Rituaria      \n",
      "BIOEXATA FARMACIA      \n",
      "CASCOL COMBUSTIVEIS    \n",
      "LIMBER SOFTWARE E CONS \n",
      "PAG*EduardoMeireles    \n",
      "PAG*EduardoMeireles    \n",
      "PAG*CidaRommanel       \n",
      "ALLPARK EMPREENDIMENTOS\n",
      "MERCADOLIVRE*CHINALINK \n",
      "PG *S S MENDES COMERCI \n",
      "MERCADOLIVRE*3PRODUTOS \n",
      "WOW*SALE COMERCIO E SE \n",
      "PARENTELA PANIFICADORA \n",
      "FranciscoDeAssis       \n",
      "BONNAPAN SEU DIA MAIS  \n",
      "MP*BRILHODASARTE       \n",
      "CARREFOUR PL2 338      \n",
      "UBER * PENDING         \n",
      "UBER * PENDING         \n",
      "UBER* TRIP             \n",
      "UBER* TRIP             \n",
      "UBER * PENDING         \n",
      "UBER* TRIP             \n",
      "UBER* TRIP             \n",
      "UBER* TRIP             \n",
      "UBER* TRIP             \n",
      "UBER* TRIP             \n",
      "UBER * PENDING         \n",
      "CIDA REIS MODA FITNESS \n",
      "LANCHONETE SERRA RODO  \n",
      "HOTEL GOYA P*hote      \n",
      "BRASILIA EMPR PARC 07/1\n",
      "PG *B4A GLAMB PARC 08/1\n"
     ]
    }
   ],
   "source": [
    "# Preview the third field (index 2) of every tuple build_insert produces\n",
    "tLists = build_insert(create_lists(), 1, 1)\n",
    "for row in tLists:\n",
    "    memo = row[2]\n",
    "    print(memo)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}