Updated Pre Consume Script Examples (markdown)

Jörg Menke 2024-03-15 20:16:48 +01:00
parent d1f7627d5f
commit a6ca23919e

@ -4,39 +4,59 @@ This wiki page is a repository of example [pre-consume scripts](https://docs.pap
## Removing Blank Pages ## Removing Blank Pages
- :warning: **This script modifies the original file** - :warning: **This script modifies the original file**
- Original source: https://github.com/paperless-ngx/paperless-ngx/discussions/668#discussioncomment-3936343 - Original source: https://github.com/paperless-ngx/paperless-ngx/discussions/668#discussioncomment-3936343 with a slight update (suppresses warnings for Apple PDFs)
```bash ```bash
#!/usr/bin/env bash #!/bin/bash
#set -x -e -o pipefail
# Stop at the first failing command or failing pipeline stage.
set -e -o pipefail set -e -o pipefail
# Run every tool below under the C locale.
export LC_ALL=C export LC_ALL=C
# Summed ink coverage a page must exceed to be kept (compared in non_blank).
THRESHOLD=0.002
#IN="$1" #IN="$1"
# Path of the document to process; DOCUMENT_WORKING_PATH is not set by this
# script, so it has to be supplied by the caller's environment.
IN="${DOCUMENT_WORKING_PATH}" IN="$DOCUMENT_WORKING_PATH"
# Check for PDF format
# `file -b` prints the type description without the file name; only the text
# before the first comma of that description is compared.
TYPE=$(file -b "$IN")
if [ "${TYPE%%,*}" != "PDF document" ]; then
# Anything that is not a PDF is reported on stderr and skipped with status 0.
>&2 echo "Skipping $IN - non PDF [$TYPE]."
exit 0
fi
# PDF file - proceed
#PAGES=$(pdfinfo "$IN" | grep ^Pages: | tr -dc '0-9')
# Page count: second field of the pdfinfo line that matches "Pages:".
PAGES=$(pdfinfo "$IN" | awk '/Pages:/ {print $2}')
>&2 echo Total pages $PAGES
# Threshold for HP scanners
# THRESHOLD=1
# Threshold for Lexmark MC2425
# Assigned after the earlier THRESHOLD line, so this is the value in effect
# when non_blank runs.
THRESHOLD=0.8
# Page count again: all digits of the pdfinfo line that starts with "Pages:".
# This overwrites the value assigned above.
PAGES=$(pdfinfo "${IN}" | grep -a "^Pages:" | tr -dc '0-9')
# non_blank: print the number of every page whose ink coverage exceeds
# THRESHOLD, one number per line.
# Globals read: PAGES (page count), IN (PDF path), THRESHOLD (cut-off).
# Note:         PERCENT and i are assigned without `local`.
# Output:       kept page numbers on stdout; progress messages on stderr, so
#               they do not end up in the captured page list.
non_blank() { non_blank() {
for (( i="1"; i<="${PAGES}"; i++ )); do for i in $(seq 1 $PAGES) ; do
# Run Ghostscript on page i only with an ink-coverage device, keep the output
# lines containing "CMYK", and add up their first four fields.
PERCENT=$(gs -o - -dFirstPage="${i}" -dLastPage="${i}" -sDEVICE=inkcov "${IN}" | grep CMYK | nawk 'BEGIN { sum=0; } {sum += $1 + $2 + $3 + $4;} END { printf "%.5f\n", sum } ') PERCENT=$(gs -o - -dFirstPage=${i} -dLastPage=${i} -sDEVICE=ink_cov "${IN}" | grep CMYK | nawk 'BEGIN { sum=0; } {sum += $1 + $2 + $3 + $4;} END { printf "%.5f\n", sum } ')
# The comparison is delegated to awk because PERCENT and THRESHOLD are decimal
# fractions; awk exits 0 exactly when PERCENT is greater than THRESHOLD.
if awk "BEGIN { exit !(${PERCENT} > ${THRESHOLD}) }"; then >&2 echo -n "Color-sum in page $i is $PERCENT: "
echo "${i}" if awk "BEGIN { exit !($PERCENT > $THRESHOLD) }"; then
echo $i
>&2 echo "Page added to document"
else else
>&2 echo "Color-sum is ${PERCENT}: will remove blank page ${i} of ${IN}" >&2 echo "Page removed from document"
fi fi
done done
} }
# Collect the page numbers printed by non_blank (newline-separated).
NON_BLANK="$(non_blank)" NON_BLANK=$(non_blank)
# Turn the newlines into spaces, then drop the one trailing space that comes
# from the newline the here-string appends.
NON_BLANK="$(tr '\n' ' ' <<<"${NON_BLANK}")"
NON_BLANK="${NON_BLANK% }"
# Only rewrite the PDF when at least one page is kept; when the list is empty
# the file is left untouched.
if [ -n "${NON_BLANK}" ]; then if [ -n "$NON_BLANK" ]; then
# Build the comma-separated page list handed to qpdf. Where $NON_BLANK is
# echoed unquoted, shell word splitting is what joins the lines with single
# spaces before tr replaces them.
NON_BLANK=$(echo "${NON_BLANK}" | tr ' ' ",") NON_BLANK=$(echo $NON_BLANK | tr ' ' ",")
# --replace-input overwrites IN in place, "." after --pages refers to that
# same input file, and --warning-exit-0 makes qpdf exit 0 when it only
# produced warnings.
qpdf "${IN}" --replace-input --pages . "${NON_BLANK}" -- qpdf "$IN" --warning-exit-0 --replace-input --pages . $NON_BLANK --
fi fi
``` ```
## Cleaning with `qpdf` ## Cleaning with `qpdf`