Updated Pre Consume Script Examples (markdown)

Jörg Menke 2024-03-15 20:16:48 +01:00
parent d1f7627d5f
commit a6ca23919e

@ -4,39 +4,59 @@ This wiki page is a repository of example [pre-consume scripts](https://docs.pap
## Removing Blank Pages
- :warning: **This script modifies the original file**
- Original source: https://github.com/paperless-ngx/paperless-ngx/discussions/668#discussioncomment-3936343, with a slight update (suppresses warnings for Apple PDFs)
```bash
#!/usr/bin/env bash
#
# Pre-consume script: removes blank pages from the incoming PDF in place.
#
# Environment:
#   DOCUMENT_WORKING_PATH  path of the document to inspect and rewrite
#
# Non-PDF input is skipped with exit status 0 so it is left untouched.

# For tracing while debugging, add: set -x
set -e -o pipefail
# Presumably pins the C locale so the tools below parse and print numbers
# with a "." decimal separator regardless of the host locale.
export LC_ALL=C

IN="${DOCUMENT_WORKING_PATH}"

# Check for PDF format: `file -b` prints e.g. "PDF document, version 1.4";
# only the part before the first comma is compared.
TYPE=$(file -b "${IN}")
if [ "${TYPE%%,*}" != "PDF document" ]; then
  >&2 echo "Skipping ${IN} - non PDF [${TYPE}]."
  exit 0
fi

# PDF file - proceed

# Ink-coverage sum above which a page counts as non-blank.
# Threshold for HP scanners
# THRESHOLD=1
# Threshold for Lexmark MC2425
THRESHOLD=0.8

# Anchor on "^Pages:" so other pdfinfo lines that merely contain the word
# cannot match; -a makes grep treat non-text metadata bytes as text.
PAGES=$(pdfinfo "${IN}" | grep -a "^Pages:" | tr -dc '0-9')
>&2 echo "Total pages ${PAGES}"
#######################################
# Print the 1-based number of every page that is not blank.
# A page is kept when the sum of its four CMYK coverage values, as reported
# by the Ghostscript ink_cov device, is greater than THRESHOLD.
# Globals:
#   IN        (read) path of the PDF to inspect
#   PAGES     (read) number of pages in the PDF
#   THRESHOLD (read) coverage sum above which a page counts as non-blank
# Outputs:
#   stdout: kept page numbers, one per line
#   stderr: one diagnostic line per page
#######################################
non_blank() {
  local i percent
  for (( i = 1; i <= PAGES; i++ )); do
    # Render only page i; the device prints one "C M Y K CMYK OK" line.
    percent=$(gs -o - -dFirstPage="${i}" -dLastPage="${i}" -sDEVICE=ink_cov "${IN}" \
      | grep CMYK \
      | awk 'BEGIN { sum = 0 } { sum += $1 + $2 + $3 + $4 } END { printf "%.5f\n", sum }')
    >&2 echo -n "Color-sum in page ${i} is ${percent}: "
    # Floating-point comparison is delegated to awk; the values are passed
    # with -v instead of being spliced into the awk program text.
    if awk -v p="${percent}" -v t="${THRESHOLD}" 'BEGIN { exit !(p > t) }'; then
      echo "${i}"
      >&2 echo "Page added to document"
    else
      >&2 echo "Page removed from document"
    fi
  done
}
# Detect the non-blank pages once, then rewrite the PDF in place keeping only
# those pages. Detection must not be repeated after qpdf has replaced the
# input: the page numbers would no longer refer to the original document.
NON_BLANK=$(non_blank)

# If every page was judged blank the list is empty and the file is left as is.
if [ -n "${NON_BLANK}" ]; then
  # qpdf takes a comma-separated page list (e.g. 1,3,4). The here-string adds
  # a trailing newline, which becomes a trailing comma and is stripped below.
  NON_BLANK=$(tr '\n' ',' <<<"${NON_BLANK}")
  NON_BLANK="${NON_BLANK%,}"
  # --warning-exit-0: exit 0 when qpdf only emitted warnings, so that
  # `set -e` does not abort on PDFs that trigger them (e.g. Apple PDFs).
  qpdf "${IN}" --warning-exit-0 --replace-input --pages . "${NON_BLANK}" --
fi
```
## Cleaning with `qpdf`