diff --git a/etl.py b/etl.py index c3df4dd..07c846d 100644 --- a/etl.py +++ b/etl.py @@ -4,18 +4,27 @@ from dotenv import load_dotenv load_dotenv() insert_query = "INSERT INTO default.TRANSACTION (TDATE, ACCOUNTID, MEMO, CITY, COUNTRY, OUTFLOW, INFLOW, OWNERID, INSTALLMENT_NR, INSTALLMENT_TT, CREATED, UPDATED) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )" -input_file = os.getenv('INPUT_FILE') +input_file = os.getenv("INPUT_FILE") -def create_lists(input_file: str): + +def create_lists(): import re - with open(input_file, "r", encoding="latin") as file: + # Open the text file + # with open("OUROCARD_VISA_INFINITE-Próxima_Fatura.txt", "r", encoding="latin") as file: + with open("OUROCARD_VISA_INFINITE-Ago_24.txt", "r", encoding="latin") as file: + # Read the contents of the file contents = file.readlines() - # REGEX - owner_pattern = r"\d\s*-\s*(\w+)" + # Define the regex patterns + owner_pattern = r"\d\s?-\s?([A-Z]+)" line_pattern = r"\d{2}\.\d{2}\.\d{4}.{23}.{14}.{2}\s*-?\d*\.?\d+,\d{2}\s*\d+,\d{2}" - payment_pattern = (r"\d{2}\.\d{2}\.\d{4}PGTO.*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})") + payment_pattern = ( + r"\d{2}\.\d{2}\.\d{4}PGTO.*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})" + ) + partial_invoice_line_pattern = ( + r"\d{2}\/\d{2}.{27}.{16}.{2}\s+\s*-?\d*\.?\d+,\d{2}\s*\d+,\d{2}" + ) # Lists current_list = None @@ -23,6 +32,7 @@ def create_lists(input_file: str): result = {} silly_counter = 1 + isPartial = True # Find Owners for line in contents: @@ -38,6 +48,7 @@ def create_lists(input_file: str): result[list_name]["owner_id"] = silly_counter silly_counter = silly_counter + 1 + # Treat and create transaction lists for line in contents: line = line.strip() @@ -49,9 +60,19 @@ def create_lists(input_file: str): else: if re.match(payment_pattern, line): result[current_list]["tlist"].append(line) - elif re.match(line_pattern, line): + elif re.match(line_pattern, line) or re.match( + partial_invoice_line_pattern, line + ): result[current_list]["tlist"].append(line) + # Check file pattern + sample = result[current_list]["tlist"][0] + if re.match(line_pattern, sample): + isPartial = False + + for listObj in result: + result[listObj]["isPartial"] = isPartial + return result @@ -60,58 +81,81 @@ def build_insert(input_dict: dict, account: int): import re insert_bulk = [] - line_group_pattern = r"(\d{2})\.(\d{2})\.(\d{4})((.+PARC (\d+.)\/(\d+))(\s.{12})|(.{23})(.{14}))(.{2})(\s*-?\d*\.?\d+,\d{2})(\s*\d*\.?\d+,\d{2})" - payment_pattern = r"(\d{2})\.(\d{2})\.(\d{4})(PGTO DEBITO CONTA).*200211(\s*-?\d*\.?\d+,\d{2})(\s*\d+,\d{2})" + + # RegEx Patterns + line_group_pattern = r"(?P\d{2})\.(?P\d{2})\.(?P\d{4})(?:(?P.+PARC (?P\d+.)\/(?P\d+)\s.{12})|(?P.{37}))(?P.{2})(?P\s*-?\d*\.?\d+,\d{2})(?P\s*\d*\.?\d+,\d{2})" + partial_invoice_group_pattern = r"(?P\d{2})\/(?P\d{2})(?:(?P.+PARC (?P\d{2})\/(?P\d{2}).{15})|(?P.{43}))(?P.{2})(?P\s+\s*-?\d*\.?\d+,\d{2})(?P\s*\d+,\d{2})" + payment_pattern = r"(?P\d{2})\.(?P\d{2})\.(?P\d{4})(?PPGTO DEBITO CONTA).*200211(?P\s*-?\d*\.?\d+,\d{2})(?P\s*\d+,\d{2})" for key in input_dict: + if input_dict[key]["isPartial"]: + pattern_to_use = partial_invoice_group_pattern + else: + pattern_to_use = line_group_pattern + for item in input_dict[key]["tlist"]: - # * check for payment + # check for payment matches = re.match(payment_pattern, item) if matches: tTdate = str( date( - int(matches.group(3)), - int(matches.group(2)), - int(matches.group(1)), + int(matches.group("year")), + int(matches.group("month")), + int(matches.group("day")), ) ) tAccount = account - tMemo = matches.group(4) - tCity = None + tMemo = matches.group("memo") tCountry = None tOutflow = None - tInflow = matches.group(5).strip().replace(".", "").replace(",", ".") + tInflow = ( + matches.group("inflow") + .strip() + .replace(".", "") + .replace(",", ".") + .replace("-", "") + ) tOwner = input_dict[key]["owner_id"] tInstallmentNr = None tInstallmentTt = None tCreated = str(datetime.now(tz=None)) tUpdated = None else: - matches = re.match(line_group_pattern, item) + matches = re.match(pattern_to_use, item) tTdate = str( date( - int(matches.group(3)), - int(matches.group(2)), - int(matches.group(1)), + # partial files will not have the year data on transactions + ( + int(matches.group("year")) + if pattern_to_use == line_group_pattern + else datetime.now().year + ), + int(matches.group("month")), + int(matches.group("day")), ) ) + tAccount = account - # * check for Installments - if matches.group(5): - tMemo = matches.group(5) - tCity = matches.group(8) - tInstallmentNr = int(matches.group(6)) - tInstallmentTt = int(matches.group(7)) - else: - tMemo = matches.group(9) - tCity = matches.group(10) - tInstallmentNr = 1 - tInstallmentTt = None - - tCountry = matches.group(11) - tOutflow = matches.group(12).strip().replace(".", "").replace(",", ".") - tInflow = matches.group(13).strip().replace(".", "").replace(",", ".") + tMemo = ( + matches.group("p_memo") + if matches.group("p_memo") + else matches.group("memo") + ) + tInstallmentNr = ( + int(matches.group("p_nr")) if matches.group("p_nr") else None + ) + tInstallmentTt = ( + int(matches.group("p_tt")) if matches.group("p_tt") else None + ) + + tCountry = matches.group("country") + tOutflow = ( + matches.group("outflow").strip().replace(".", "").replace(",", ".") + ) + tInflow = ( + matches.group("inflow").strip().replace(".", "").replace(",", ".") + ) tOwner = input_dict[key]["owner_id"] tCreated = str(datetime.now(tz=None)) @@ -121,7 +165,6 @@ def build_insert(input_dict: dict, account: int): tTdate, tAccount, tMemo, - tCity, tCountry, tOutflow, tInflow, @@ -131,10 +174,11 @@ def build_insert(input_dict: dict, account: int): tCreated, tUpdated, ) - ) + ) return insert_bulk + def db_insert(insert_bulk: list[tuple]): from mysql.connector import connect, Error @@ -155,4 +199,5 @@ def db_insert(insert_bulk: list[tuple]): finally: connection.close() -db_insert(build_insert(create_lists(), 1)) \ No newline at end of file + +db_insert(build_insert(create_lists(), 1))