robopato/pdfScrape.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File name of the statement text export that the cells in this notebook open.\n",
    "test_file = \"OUROCARD_VISA_INFINITE-Ago_24.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump the raw statement text to inspect its layout.\n",
    "# Later cells read this same export as Latin-1; use that encoding here too\n",
    "# instead of the platform default.\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as reader:\n",
    "    data = reader.read()\n",
    "    print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Load the whole statement as a single string.\n",
    "# Later cells read this same export as Latin-1; use that encoding here too\n",
    "# instead of the platform default.\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n",
    "    contents = file.read()\n",
    "\n",
    "# Transaction row: a dd.mm.yyyy date, then 23 + 14 + 2 fixed-width characters,\n",
    "# then two amounts written with a decimal comma.\n",
    "pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
    "\n",
    "# Print every span of the text that matches the row layout\n",
    "for matches in re.finditer(pattern, contents):\n",
    "    print(matches.group())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Load the whole statement as a single string.\n",
    "# Later cells read this same export as Latin-1; use that encoding here too\n",
    "# instead of the platform default.\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n",
    "    contents = file.read()\n",
    "\n",
    "# Any line that mentions one of the two card holders ('.' does not cross newlines,\n",
    "# so each match is at most one line).\n",
    "pattern = r'.*DANIEL.*|.*IZABELY.*'\n",
    "\n",
    "# Print every matching line\n",
    "for matches in re.finditer(pattern, contents):\n",
    "    print(matches.group())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Load the whole statement as a single string.\n",
    "# Later cells read this same export as Latin-1; use that encoding here too\n",
    "# instead of the platform default.\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n",
    "    contents = file.read()\n",
    "\n",
    "# Card-holder lines. The leading '.' matters: a pattern that starts with a bare '*'\n",
    "# is rejected by re with \"nothing to repeat\" as soon as it is compiled.\n",
    "dan_pattern = r'.*DANIEL.*'\n",
    "iza_pattern = r'.*IZABELY.*'\n",
    "# Transaction row: dd.mm.yyyy date, 23 + 14 + 2 fixed-width characters, two amounts\n",
    "line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
    "\n",
    "# Print every span of the text that matches the transaction row layout\n",
    "for matches in re.finditer(line_pattern, contents):\n",
    "    print(matches.group())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split 'table-test.txt' into the lines that follow each TABLEA / TABLEB marker.\n",
    "with open('table-test.txt', 'r') as file:\n",
    "    contents = file.readlines()\n",
    "\n",
    "table_a_lines = []\n",
    "table_b_lines = []\n",
    "\n",
    "# Marker text -> list that collects the lines found under that marker\n",
    "sections = {'TABLEA': table_a_lines, 'TABLEB': table_b_lines}\n",
    "\n",
    "# Marker seen most recently; lines before the first marker are dropped\n",
    "current_table = None\n",
    "\n",
    "for raw_line in contents:\n",
    "    line = raw_line.strip()\n",
    "    if line in sections:\n",
    "        # A marker line switches the active section and is not stored itself\n",
    "        current_table = line\n",
    "        continue\n",
    "    if current_table is not None:\n",
    "        sections[current_table].append(line)\n",
    "\n",
    "# Report both sections, one stored line per output line\n",
    "for heading, rows in (\n",
    "    ('Lines under TABLEA:', table_a_lines),\n",
    "    ('\\nLines under TABLEB:', table_b_lines),\n",
    "):\n",
    "    print(heading)\n",
    "    for data in rows:\n",
    "        print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from datetime import date, datetime\n",
    "import locale\n",
    "\n",
    "# Nothing below reads the locale, so a machine without pt_BR installed\n",
    "# should still be able to run this cell.\n",
    "try:\n",
    "    locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')\n",
    "except locale.Error as err:\n",
    "    print('pt_BR.UTF-8 locale unavailable:', err)\n",
    "\n",
    "# Read the statement export (as Latin-1) into a list of lines\n",
    "with open('OUROCARD_VISA_INFINITE-Ago_24.txt', 'r', encoding='latin') as file:\n",
    "    contents = file.readlines()\n",
    "\n",
    "# Section headers of the two card holders\n",
    "dan_pattern = r'1 - DANIEL.*'\n",
    "iza_pattern = r'4 - IZABELY.*'\n",
    "# Transaction row: dd.mm.yyyy date, 23 + 14 + 2 fixed-width characters, two amounts\n",
    "line_pattern = r'\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*\\d+,\\d{2}\\s*\\d+,\\d{2}'\n",
    "# Same layout with one capture group per field\n",
    "line_group_pattern = r'(\\d{2})\\.(\\d{2})\\.(\\d{4})(.{23})(.{14})(.{2})(\\s*\\d+,\\d{2})(\\s*\\d+,\\d{2})'\n",
    "\n",
    "\n",
    "def to_insert_tuple(item, owner_id):\n",
    "    \"\"\"Convert one transaction line into an 11-field insert tuple.\n",
    "\n",
    "    Fields, in order: ISO date, account (always 1), memo, city, country,\n",
    "    outflow, inflow, owner id, installments (always 1), creation timestamp,\n",
    "    update timestamp (always None). Amounts stay strings with the decimal\n",
    "    comma replaced by a dot.\n",
    "    \"\"\"\n",
    "    matches = re.search(line_group_pattern, item)\n",
    "    # Groups 1-3 are day, month, year\n",
    "    tTdate = str(date(int(matches.group(3)), int(matches.group(2)), int(matches.group(1))))\n",
    "    tOutflow = matches.group(7).strip().replace(',', '.')\n",
    "    tInflow = matches.group(8).strip().replace(',', '.')\n",
    "    tCreated = str(datetime.now(tz=None))\n",
    "    return (\n",
    "        tTdate,\n",
    "        1,\n",
    "        matches.group(4),\n",
    "        matches.group(5),\n",
    "        matches.group(6),\n",
    "        tOutflow,\n",
    "        tInflow,\n",
    "        owner_id,\n",
    "        1,\n",
    "        tCreated,\n",
    "        None,\n",
    "    )\n",
    "\n",
    "\n",
    "# Raw transaction lines per card holder\n",
    "list_dan = []\n",
    "list_iza = []\n",
    "current_list = None\n",
    "\n",
    "insert_bulk = []\n",
    "\n",
    "# A header line switches the active holder; transaction lines go to the active holder's list\n",
    "for line in contents:\n",
    "    line = line.strip()\n",
    "    if re.match(dan_pattern, line):\n",
    "        current_list = 'list_dan'\n",
    "        print('found Dan')\n",
    "    elif re.match(iza_pattern, line):\n",
    "        current_list = 'list_iza'\n",
    "        print('found Iza')\n",
    "    elif re.match(line_pattern, line):\n",
    "        if current_list == 'list_dan':\n",
    "            print(\"dan\", line)\n",
    "            list_dan.append(line)\n",
    "        elif current_list == 'list_iza':\n",
    "            print(\"iza\", line)\n",
    "            list_iza.append(line)\n",
    "\n",
    "print('list_dan - tuples for insert')\n",
    "insert_bulk.extend(to_insert_tuple(item, 1) for item in list_dan)\n",
    "\n",
    "# This label used to repeat 'list_dan' although the loop handles list_iza\n",
    "print('list_iza - tuples for insert')\n",
    "insert_bulk.extend(to_insert_tuple(item, 2) for item in list_iza)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameterised INSERT used by db_insert(): twelve columns, twelve %s placeholders.\n",
    "insert_query = (\n",
    "    \"INSERT INTO default.TRANSACTION \"\n",
    "    \"(TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, \"\n",
    "    \"INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) \"\n",
    "    \"VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_lists(path=\"OUROCARD_VISA_INFINITE-Ago_24.txt\"):\n",
    "    \"\"\"Group the statement's transaction lines by card holder.\n",
    "\n",
    "    A line that starts with \"<digit> - <NAME>\" opens that holder's section;\n",
    "    every following payment or transaction line is stored under the holder\n",
    "    until the next header appears.\n",
    "\n",
    "    Args:\n",
    "        path: text export to read (opened as Latin-1). Defaults to the file\n",
    "            this notebook has always parsed.\n",
    "\n",
    "    Returns:\n",
    "        dict keyed by \"list_<name in lower case>\"; each value is a dict with\n",
    "        \"owner_name\", \"owner_id\" (1, 2, ... in order of first appearance)\n",
    "        and \"tlist\" (the stripped statement lines of that holder).\n",
    "    \"\"\"\n",
    "    import re\n",
    "\n",
    "    # Section header, e.g. \"1 - DANIEL ...\"; group 1 is the first word of the name\n",
    "    owner_pattern = r\"\\d\\s*-\\s*(\\w+)\"\n",
    "    # Regular transaction row and bill-payment row\n",
    "    line_pattern = r\"\\d{2}\\.\\d{2}\\.\\d{4}.{23}.{14}.{2}\\s*-?\\d*\\.?\\d+,\\d{2}\\s*\\d+,\\d{2}\"\n",
    "    payment_pattern = (r\"\\d{2}\\.\\d{2}\\.\\d{4}PGTO.*200211(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d+,\\d{2})\")\n",
    "\n",
    "    with open(path, \"r\", encoding=\"latin\") as file:\n",
    "        contents = file.readlines()\n",
    "\n",
    "    result = {}\n",
    "    current_list = None\n",
    "\n",
    "    for line in contents:\n",
    "        line = line.strip()\n",
    "\n",
    "        # Headers are only recognised at the start of a line. Searching anywhere\n",
    "        # in the line would also hit \"<digit>   -<amount>\" inside transaction rows\n",
    "        # and register owners that never receive a \"tlist\".\n",
    "        found_owner = re.match(owner_pattern, line)\n",
    "        if found_owner:\n",
    "            owner_name = found_owner.group(1)\n",
    "            current_list = f\"list_{owner_name.lower()}\"\n",
    "            if current_list not in result:\n",
    "                # First sighting: allocate the next id. A repeated header keeps\n",
    "                # the id and the lines collected so far.\n",
    "                result[current_list] = {\n",
    "                    \"owner_name\": owner_name,\n",
    "                    \"owner_id\": len(result) + 1,\n",
    "                    \"tlist\": [],\n",
    "                }\n",
    "            continue\n",
    "\n",
    "        if current_list is None:\n",
    "            # No header seen yet, so there is no holder to attribute this line to\n",
    "            continue\n",
    "\n",
    "        if re.match(payment_pattern, line) or re.match(line_pattern, line):\n",
    "            result[current_list][\"tlist\"].append(line)\n",
    "\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the per-owner grouping that create_lists() builds from the statement file\n",
    "print(create_lists())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_insert(input_dict: dict, account: int):\n",
    "    \"\"\"Convert the per-owner statement lines into row tuples for the bulk INSERT.\n",
    "\n",
    "    Args:\n",
    "        input_dict: mapping whose values are dicts holding an \"owner_id\" and a\n",
    "            \"tlist\" of raw statement lines (the shape create_lists() returns).\n",
    "            An entry without \"tlist\" contributes no rows.\n",
    "        account: value placed in the account position of every row.\n",
    "\n",
    "    Returns:\n",
    "        list of 12-tuples: (date as ISO text, account, memo, city, country,\n",
    "        outflow, inflow, owner id, installment number, installment total,\n",
    "        creation timestamp, None). Amounts are strings using a decimal dot.\n",
    "        Payment rows carry None for city, country, outflow and both\n",
    "        installment fields.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: a line matches neither the payment nor the transaction\n",
    "            layout (previously an AttributeError on None).\n",
    "    \"\"\"\n",
    "    from datetime import date, datetime\n",
    "    import re\n",
    "\n",
    "    # Transaction row; the first alternative of group 4 is the instalment form\n",
    "    # (\"... PARC nn/mm\"), the second the plain 23 + 14 character memo/city form.\n",
    "    line_group_pattern = r\"(\\d{2})\\.(\\d{2})\\.(\\d{4})((.+PARC (\\d+.)\\/(\\d+))(\\s.{12})|(.{23})(.{14}))(.{2})(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d*\\.?\\d+,\\d{2})\"\n",
    "    payment_pattern = r\"(\\d{2})\\.(\\d{2})\\.(\\d{4})(PGTO DEBITO CONTA).*200211(\\s*-?\\d*\\.?\\d+,\\d{2})(\\s*\\d+,\\d{2})\"\n",
    "\n",
    "    def iso_date(matches):\n",
    "        # Groups 1-3 are day, month, year in both layouts\n",
    "        return str(\n",
    "            date(\n",
    "                int(matches.group(3)),\n",
    "                int(matches.group(2)),\n",
    "                int(matches.group(1)),\n",
    "            )\n",
    "        )\n",
    "\n",
    "    def decimal_text(amount):\n",
    "        # \"1.234,56\" -> \"1234.56\": drop thousands dots, then swap the decimal comma\n",
    "        return amount.strip().replace(\".\", \"\").replace(\",\", \".\")\n",
    "\n",
    "    insert_bulk = []\n",
    "\n",
    "    for key in input_dict:\n",
    "        entry = input_dict[key]\n",
    "        for item in entry.get(\"tlist\", []):\n",
    "            # * check for payment\n",
    "            matches = re.match(payment_pattern, item)\n",
    "            if matches:\n",
    "                tMemo = matches.group(4)\n",
    "                tCity = None\n",
    "                tCountry = None\n",
    "                tOutflow = None\n",
    "                tInflow = decimal_text(matches.group(5))\n",
    "                tInstallmentNr = None\n",
    "                tInstallmentTt = None\n",
    "            else:\n",
    "                matches = re.match(line_group_pattern, item)\n",
    "                if matches is None:\n",
    "                    raise ValueError(f\"unrecognised statement line: {item!r}\")\n",
    "\n",
    "                # * check for Installments\n",
    "                if matches.group(5):\n",
    "                    tMemo = matches.group(5)\n",
    "                    tCity = matches.group(8)\n",
    "                    tInstallmentNr = int(matches.group(6))\n",
    "                    tInstallmentTt = int(matches.group(7))\n",
    "                else:\n",
    "                    tMemo = matches.group(9)\n",
    "                    tCity = matches.group(10)\n",
    "                    tInstallmentNr = 1\n",
    "                    tInstallmentTt = None\n",
    "\n",
    "                tCountry = matches.group(11)\n",
    "                tOutflow = decimal_text(matches.group(12))\n",
    "                tInflow = decimal_text(matches.group(13))\n",
    "\n",
    "            insert_bulk.append(\n",
    "                (\n",
    "                    iso_date(matches),\n",
    "                    account,\n",
    "                    tMemo,\n",
    "                    tCity,\n",
    "                    tCountry,\n",
    "                    tOutflow,\n",
    "                    tInflow,\n",
    "                    entry[\"owner_id\"],\n",
    "                    tInstallmentNr,\n",
    "                    tInstallmentTt,\n",
    "                    str(datetime.now(tz=None)),\n",
    "                    None,\n",
    "                )\n",
    "            )\n",
    "\n",
    "    return insert_bulk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def db_insert(insert_bulk: list[tuple]):\n",
    "    \"\"\"Bulk-insert the prepared rows using the notebook-level ``insert_query``.\n",
    "\n",
    "    Connection settings come from the environment so no secret lives in the\n",
    "    notebook: ROBOPATO_DB_HOST (default \"localhost\"), ROBOPATO_DB_USER\n",
    "    (default \"root\"), ROBOPATO_DB_NAME (default \"default\") and\n",
    "    ROBOPATO_DB_PASSWORD. When the password variable is unset it is asked\n",
    "    for interactively.\n",
    "\n",
    "    Args:\n",
    "        insert_bulk: row tuples matching the placeholders of ``insert_query``.\n",
    "            An empty list is a no-op.\n",
    "\n",
    "    Connector errors are printed, not raised.\n",
    "    \"\"\"\n",
    "    import os\n",
    "    from getpass import getpass\n",
    "\n",
    "    from mysql.connector import connect, Error\n",
    "\n",
    "    if not insert_bulk:\n",
    "        print(\"Nothing to insert.\")\n",
    "        return\n",
    "\n",
    "    password = os.environ.get(\"ROBOPATO_DB_PASSWORD\")\n",
    "    if password is None:\n",
    "        password = getpass(\"MySQL password: \")\n",
    "\n",
    "    try:\n",
    "        # The with-block closes the connection on exit. The former\n",
    "        # `finally: connection.close()` raised UnboundLocalError whenever\n",
    "        # connect() itself failed, hiding the real error.\n",
    "        with connect(\n",
    "            host=os.environ.get(\"ROBOPATO_DB_HOST\", \"localhost\"),\n",
    "            user=os.environ.get(\"ROBOPATO_DB_USER\", \"root\"),\n",
    "            password=password,\n",
    "            database=os.environ.get(\"ROBOPATO_DB_NAME\", \"default\"),\n",
    "        ) as connection:\n",
    "            print(\"CONNECTED!\", connection)\n",
    "            with connection.cursor() as cursor:\n",
    "                cursor.executemany(insert_query, insert_bulk)\n",
    "            connection.commit()\n",
    "            print(\"DONE!\")\n",
    "    except Error as e:\n",
    "        print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full pipeline: parse the statement, build the row tuples for account 1, insert them\n",
    "db_insert(build_insert(create_lists(), 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the row tuples for account 1 without calling db_insert()\n",
    "build_insert(create_lists(), 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the per-owner grouping returned by create_lists()\n",
    "create_lists()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch check: nested dicts keyed by owner, mutated in place between two printouts.\n",
    "dictTest = {\n",
    "    \"owner1\": {\n",
    "        \"owner_label\": \"foo\",\n",
    "        \"owner_id\": 1,\n",
    "        \"list1\": [\"thingies, thingies, 42\"],\n",
    "    },\n",
    "    \"owner2\": {\n",
    "        \"owner_label\": \"bar\",\n",
    "        \"owner_id\": 2,\n",
    "        \"list1\": [\"thingies, thingies, 42\"],\n",
    "    },\n",
    "}\n",
    "\n",
    "\n",
    "def show_owners(owners):\n",
    "    \"\"\"Print each owner's id and label, then every item of its \"list1\".\"\"\"\n",
    "    for record in owners.values():\n",
    "        print(record[\"owner_id\"], record[\"owner_label\"])\n",
    "        for entry in record[\"list1\"]:\n",
    "            print(entry)\n",
    "\n",
    "\n",
    "show_owners(dictTest)\n",
    "\n",
    "# Changing a nested value is visible on the next printout\n",
    "dictTest[\"owner1\"][\"owner_label\"] = \"yadda\"\n",
    "\n",
    "show_owners(dictTest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch check: build a nested dict whose keys come from variables.\n",
    "param1 = \"foo\"\n",
    "param2 = \"bar\"\n",
    "testy = {\n",
    "    param1: {\n",
    "        param2: [\"what\", \"when\", \"why\"],\n",
    "        \"number\": 1,\n",
    "    },\n",
    "}\n",
    "\n",
    "print(testy)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}