# paperless-ngx/src/paperless_text/parsers.py

import itertools
import os
import shlex
import subprocess

from django.conf import settings

from documents.parsers import DocumentParser, ParseError


class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)

    The text is read straight from the file on disk, and the thumbnail is
    produced by rendering the first lines of that text with the configured
    ``convert`` binary.
    """

    CONVERT = settings.CONVERT_BINARY
    # The settings below are not referenced anywhere in this class; they are
    # left in place so code that reads them from the class keeps working.
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    def __init__(self, path):
        super().__init__(path)
        # Cache for get_text(); None means "not read yet".
        self._text = None

    def get_thumbnail(self):
        """
        The thumbnail of a text file is just a 500px wide image of the text
        rendered onto a letter-sized page.

        Three ``convert`` invocations are made: one renders the first lines
        of the document as a text layer, one draws a rounded background, and
        one merges both into ``convert.png`` inside ``self.tempdir``.

        Returns the path of the merged image. ``run_command`` raises
        ``ParseError`` when any of the invocations exits with a non-zero
        status.
        """
        # The below is heavily cribbed from https://askubuntu.com/a/590951
        #
        # run_command() joins its arguments with spaces and hands the result
        # to a shell, so every value that is not a fixed literal (document
        # text, filesystem paths) is passed through shlex.quote() to make
        # sure the shell sees it as exactly one inert word.

        bg_color = "white"  # bg color
        text_color = "black"  # text color
        psize = [500, 647]  # icon size
        n_lines = 50  # number of lines to show
        out_path = os.path.join(self.tempdir, "convert.png")

        temp_bg = os.path.join(self.tempdir, "bg.png")
        temp_txlayer = os.path.join(self.tempdir, "tx.png")
        picsize = "x".join(str(n) for n in psize)
        # The text layer is 8px smaller per side than the page; the 4px
        # border added in create_txlayer() brings it back to the full size.
        txsize = "x".join(str(n - 8) for n in psize)

        def create_bg():
            # Rounded rectangle covering the whole page, with a corner
            # radius of one tenth of the page width.
            work_size = ",".join(str(n - 1) for n in psize)
            r = str(round(psize[0] / 10))
            draw_spec = "fill {} roundrectangle 0,0,{},{},{}".format(
                bg_color, work_size, r, r
            )
            run_command(
                self.CONVERT,
                "-size", picsize,
                "xc:none",
                "-draw", shlex.quote(draw_spec),
                shlex.quote(temp_bg),
            )

        def read_text():
            # Only the first n_lines lines are rendered, so stop reading
            # there instead of loading the whole file into memory.
            with open(self.document_path, 'r') as src:
                head = itertools.islice(src, n_lines)
                return "\n".join(line.strip() for line in head)

        def create_txlayer():
            # The document content is untrusted. Quoting the complete
            # caption argument keeps shell metacharacters in it ($(), ``,
            # $VAR, quotes, backslashes) from being interpreted.
            # The single spaces around the text reproduce the argument this
            # code has always handed to convert.
            # NOTE(review): convert itself may still expand its own escapes
            # (e.g. "%" sequences) inside caption text - confirm whether
            # that needs escaping as well.
            caption = "caption: " + read_text() + " "
            run_command(
                self.CONVERT,
                "-background", "none",
                "-fill", text_color,
                "-pointsize", "12",
                "-border", "4", "-bordercolor", "none",
                "-size", txsize,
                shlex.quote(caption),
                shlex.quote(temp_txlayer),
            )

        create_txlayer()
        create_bg()
        run_command(
            self.CONVERT,
            shlex.quote(temp_bg),
            shlex.quote(temp_txlayer),
            "-background", "None",
            "-layers", "merge",
            shlex.quote(out_path),
        )

        return out_path

    def get_text(self):
        """
        Return the full content of the document as a string.

        The file is read once; the result is cached on the instance and
        returned directly on subsequent calls.
        """

        if self._text is not None:
            return self._text

        with open(self.document_path, 'r') as f:
            self._text = f.read()

        return self._text


def run_command(*args):
    """
    Run a shell command assembled from ``args`` and fail loudly on error.

    The arguments are joined with single spaces and the resulting string is
    executed through the shell, so callers are responsible for quoting any
    argument that must reach the program as a single word.

    The child process inherits the current environment, extended with
    ``MAGICK_MEMORY_LIMIT`` and ``MAGICK_TMPDIR`` whenever the matching
    ``CONVERT_MEMORY_LIMIT`` / ``CONVERT_TMPDIR`` settings are truthy.

    Raises ``ParseError`` if the command exits with a non-zero status.
    """
    child_env = dict(os.environ)
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            child_env[variable] = value

    command_line = ' '.join(args)
    process = subprocess.Popen(command_line, env=child_env, shell=True)
    exit_status = process.wait()
    if exit_status != 0:
        raise ParseError("Convert failed at {}".format(args))