diff --git a/src/invoice2data/extract/parsers/lines.py b/src/invoice2data/extract/parsers/lines.py index bf2c8c65..1c830126 100644 --- a/src/invoice2data/extract/parsers/lines.py +++ b/src/invoice2data/extract/parsers/lines.py @@ -21,24 +21,10 @@ def parse_line(patterns, line): return None -def parse(template, field, _settings, content): - """Try to extract lines from the invoice""" - - # First apply default options. - settings = DEFAULT_OPTIONS.copy() - settings.update(_settings) - +def parse_block(template, field, settings, content): # Validate settings - assert "start" in settings, "Lines start regex missing" - assert "end" in settings, "Lines end regex missing" assert "line" in settings, "Line regex missing" - start = re.search(settings["start"], content) - end = re.search(settings["end"], content) - if not start or not end: - logger.warning(f"No lines found. Start match: {start}. End match: {end}") - return - content = content[start.end() : end.start()] lines = [] current_row = {} @@ -131,6 +117,43 @@ def parse(template, field, _settings, content): return lines +def parse(template, field, _settings, content): + # First apply default options. 
+ settings = DEFAULT_OPTIONS.copy() + settings.update(_settings) + + # Validate settings + assert "start" in settings, "Lines start regex missing" + assert "end" in settings, "Lines end regex missing" + + blocks_count = 0 + lines = [] + + # Try finding & parsing blocks of lines one by one + while True: + start = re.search(settings["start"], content) + if not start: + break + content = content[start.end():] + + end = re.search(settings["end"], content) + if not end: + logger.warning("Failed to find lines block end") + break + + blocks_count += 1 + lines += parse_block(template, field, settings, content[0:end.start()]) + + content = content[end.end():] + + if blocks_count == 0: + logger.warning("Failed to find any matching block (part) of invoice for \"%s\"", field) + elif not lines: + logger.warning("Failed to find any lines for \"%s\"", field) + + return lines + + def parse_current_row(match, current_row): # Parse the current row data for field, value in match.groupdict().items(): diff --git a/tests/custom/lines-blocks.json b/tests/custom/lines-blocks.json new file mode 100644 index 00000000..7dab388a --- /dev/null +++ b/tests/custom/lines-blocks.json @@ -0,0 +1,17 @@ +[ + { + "issuer": "Lines Tests", + "date": "2022-10-15", + "invoice_number": "1234/10/2022", + "amount": 99.99, + "lines": [ + { "pos": 1, "name": "Cat" }, + { "pos": 2, "name": "Dog" }, + { "pos": 3, "name": "Frog" }, + { "pos": 4, "name": "Lizard" }, + { "pos": 5, "name": "Unicorn" } + ], + "currency": "EUR", + "desc": "Invoice from Lines Tests" + } +] diff --git a/tests/custom/lines-blocks.txt b/tests/custom/lines-blocks.txt new file mode 100644 index 00000000..20f1eb57 --- /dev/null +++ b/tests/custom/lines-blocks.txt @@ -0,0 +1,39 @@ +Issue date: 2022-10-15 +Issuer: Lines Tests +Invoice number: 1234/10/2022 +Total: 99.99 EUR + +Lines in multiple blocks + +Lines start +1. Cat +2. Dog +Lines end + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Vivamus quis metus sagittis, fermentum +risus et, vulputate orci. Curabitur id pellentesque mi, vel euismod nulla. Morbi tincidunt ipsum +eu volutpat dictum. Nam hendrerit varius mauris, a venenatis ligula lacinia et. Sed blandit +lobortis facilisis. Donec efficitur metus ac sapien luctus, eget facilisis dolor eleifend. In sapien +erat, vestibulum in sollicitudin a, euismod nec nunc. + +Lines start +3. Frog +Lines end + +Nulla elit dui, dictum in augue ac, rutrum mollis risus. In hac habitasse platea dictumst. Phasellus +quis eros ac elit iaculis vehicula et vel nunc. Aenean consequat in velit vel luctus. Proin vel +sapien cursus, ultrices turpis vel, fringilla dolor. Vestibulum ex leo, ullamcorper a quam quis, +molestie convallis est. Nulla egestas posuere purus, eget viverra elit dapibus et. Pellentesque +habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Duis posuere eros +dui. + +Lines start +4. Lizard +5. Unicorn +Lines end + +In varius nulla arcu, ac interdum velit ornare vel. Mauris a placerat lacus. Nam porta metus eget +arcu mattis, non iaculis elit luctus. Etiam rutrum volutpat arcu, vitae semper turpis mollis id. +Fusce orci dui, pellentesque et ipsum eget, pellentesque luctus leo. Nullam non mollis mi. In +semper, ex sed mollis dapibus, lectus metus vestibulum turpis, vitae convallis mauris eros in orci. +Interdum et malesuada fames ac ante ipsum primis in faucibus. 
diff --git a/tests/custom/templates/lines-blocks.yml b/tests/custom/templates/lines-blocks.yml new file mode 100644 index 00000000..30e1bd3e --- /dev/null +++ b/tests/custom/templates/lines-blocks.yml @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: MIT +issuer: Lines Tests +keywords: + - Lines Tests + - Lines in multiple blocks +fields: + date: + parser: regex + regex: Issue date:\s*(\d{4}-\d{2}-\d{2}) + type: date + invoice_number: + parser: regex + regex: Invoice number:\s*([\d/]+) + amount: + parser: regex + regex: Total:\s*(\d+\.\d\d) + type: float + lines: + parser: lines + start: Lines start + end: Lines end + line: ^(?P<pos>\d+)\.\s+(?P<name>.+)$ + types: + pos: int +options: + currency: EUR + date_formats: + - '%Y-%m-%d' + decimal_separator: '.'