Source code for mt940.parser

"""
Format
---------------------

Sources:

.. _Swift for corporates: http://www.sepaforcorporates.com/\
    swift-for-corporates/account-statement-mt940-file-format-overview/
.. _Rabobank MT940: https://www.rabobank.nl/images/\
    formaatbeschrijving_swift_bt940s_1_0_nl_rib_29539296.pdf

 - `Swift for corporates`_
 - `Rabobank MT940`_

::

    [] = optional
    ! = fixed length
    a = Text
    x = Alphanumeric, seems more like text actually. Can include special
        characters (slashes) and whitespace as well as letters and numbers
    d = Numeric separated by decimal (usually comma)
    c = Code list value
    n = Numeric
"""

from __future__ import annotations

import os
import re
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any

import mt940

if TYPE_CHECKING:
    from ._types import Processors, Source
    from .models import Transactions


def _read(src: Any, encoding: str | None = None) -> str:
    """Return the mt940 payload behind ``src`` as decoded text.

    Args:
        src: An object exposing ``read()``, the path of an existing file, or
            the raw data itself (``str`` or something with ``decode``).
        encoding: Optional codec that is tried before the built-in fallbacks.

    Returns:
        The payload as ``str``.

    Raises:
        UnicodeDecodeError: The last decoding error, when none of the
            candidate codecs could decode the payload.
    """

    def points_to_file(candidate: Any) -> bool:
        # ``os.path.isfile`` may reject a value with ValueError; treat that
        # the same as "not a file" so the value is used as raw data instead.
        try:
            return os.path.isfile(candidate)
        except ValueError:  # pragma: no cover
            return False

    # Resolve the payload: file-like object first, then filesystem path,
    # and finally the value itself.
    if hasattr(src, 'read'):  # pragma: no branch
        payload = src.read()
    elif points_to_file(src):
        with open(src, 'rb') as handle:
            payload = handle.read()
    else:  # pragma: no cover
        payload = src

    if hasattr(payload, 'decode'):  # pragma: no branch
        last_error = None
        # Falsy entries (no explicit ``encoding``) are skipped; the rest are
        # tried in order until one succeeds.
        codecs_to_try = filter(
            None, (encoding, 'utf-8', 'cp852', 'iso8859-15', 'latin1')
        )

        for codec in codecs_to_try:  # pragma: no cover
            try:
                payload = payload.decode(codec)
            except UnicodeDecodeError as error:
                # Remember the failure and move on to the next codec.
                last_error = error
            except UnicodeEncodeError:
                # Stop trying; the payload is left as it was.
                break
            else:
                break
        else:  # pragma: no cover
            # Every candidate failed: surface the most recent error.
            assert last_error is not None
            raise last_error  # pragma: no cover

    assert isinstance(payload, str)
    return payload



[docs]
def parse(
    src: Source,
    encoding: str | None = None,
    processors: Processors | None = None,
    tags: dict[int | str, mt940.tags.Tag] | None = None,
    transaction_boundary: Iterable[str] | None = None,
) -> Transactions:
    """Read MT940 data and parse it into one collection.

    The input is loaded and decoded first; the resulting text is then fed to
    a freshly created :class:`~mt940.models.Transactions`.

    Args:
        src: A file handle, a filename to read, or the raw data as
            ``str``/``bytes``.
        encoding: Optional encoding override for byte input.
        processors: Optional extra pre/post processors.
        tags: Optional extra or overriding tag parsers.
        transaction_boundary: Optional iterable of tag *slugs* that each
            start a new transaction (issue #110). By default only ``:61:``
            starts a transaction; pass e.g.
            ``{'transaction_reference_number'}`` to also start one on every
            ``:20:``. Omit it to keep the legacy behaviour.

    Returns:
        The parsed collection of transactions.
    """
    # Load and decode before building the collection, so a read failure
    # surfaces before any model object is created.
    text = _read(src, encoding)

    collection = mt940.models.Transactions(
        processors,
        tags,
        transaction_boundary=transaction_boundary,
    )
    collection.parse(text)
    return collection




[docs]
def parse_statements(
    src: Source,
    encoding: str | None = None,
    processors: Processors | None = None,
    tags: dict[int | str, mt940.tags.Tag] | None = None,
    transaction_boundary: Iterable[str] | None = None,
) -> list[Transactions]:
    """Parse an mt940 file holding several statement blocks, one by one.

    :func:`parse` merges everything into a single
    :class:`~mt940.models.Transactions`. This function instead cuts the input
    at every line that begins with ``:20:`` and parses each piece into its
    own :class:`~mt940.models.Transactions`. It is meant for files that
    concatenate several statements (e.g. balance-only blocks), where a single
    ``Transactions`` would only keep the last block's statement-level data
    such as the opening/closing/available balances (issue #107).

    Every ``:20:`` is taken as the start of a new statement, matching the
    standard where ``:20:`` is the once-per-statement transaction reference.
    That makes this function mutually exclusive with
    ``transaction_boundary={'transaction_reference_number'}`` (issue #110),
    which treats ``:20:`` as an *intra*-statement transaction boundary; the
    two target different, non-standard bank formats -- don't combine them.

    Args:
        src: A file handle, a filename to read, or the raw data as
            ``str``/``bytes``.
        encoding: Optional encoding override for byte input.
        processors: Optional extra pre/post processors (applied per block).
        tags: Optional extra or overriding tag parsers (applied per block).
        transaction_boundary: Optional iterable of tag slugs that each start
            a new transaction; handed unchanged to every per-block
            :class:`~mt940.models.Transactions`.

    Returns:
        One :class:`~mt940.models.Transactions` per statement block.
    """

    def parse_block(chunk: str) -> Transactions:
        # Each statement block gets its own, independent collection.
        parsed = mt940.models.Transactions(
            processors, tags, transaction_boundary=transaction_boundary
        )
        parsed.parse(chunk)
        return parsed

    text = _read(src, encoding)
    # Zero-width split: cut right before each line that starts with ``:20:``
    # so the tag stays at the head of its block.
    chunks = re.split(r'(?m)^(?=:20:)', text)

    # Anything that does not open with ``:20:`` (a leading header or an empty
    # chunk before the first statement) is dropped.
    return [
        parse_block(chunk)
        for chunk in chunks
        if chunk.strip().startswith(':20:')
    ]