diff --git a/Pre-Consume-Script-Examples.md b/Pre-Consume-Script-Examples.md
index bd1530b..0fb2219 100644
--- a/Pre-Consume-Script-Examples.md
+++ b/Pre-Consume-Script-Examples.md
@@ -4,39 +4,64 @@ This wiki page is a repository of example [pre-consume scripts](https://docs.pap
 ## Removing Blank Pages
 
 - :warning: **This script modifies the original file**
-- Original source: https://github.com/paperless-ngx/paperless-ngx/discussions/668#discussioncomment-3936343
+- Original source: https://github.com/paperless-ngx/paperless-ngx/discussions/668#discussioncomment-3936343 with a slight update (suppress warnings for Apple PDFs)
 
 ```bash
-#!/usr/bin/env bash
+#!/bin/bash
+#set -x -e -o pipefail
 set -e -o pipefail
 export LC_ALL=C
 
-THRESHOLD=0.002
-
 #IN="$1"
-IN="${DOCUMENT_WORKING_PATH}"
+IN="$DOCUMENT_WORKING_PATH"
+
+# Check for PDF format
+TYPE=$(file -b "$IN")
+
+if [ "${TYPE%%,*}" != "PDF document" ]; then
+    >&2 echo "Skipping $IN - non PDF [$TYPE]."
+    exit 0
+fi
+
+# PDF file - proceed
+
+#PAGES=$(pdfinfo "$IN" | grep ^Pages: | tr -dc '0-9')
+PAGES=$(pdfinfo "$IN" | awk '/^Pages:/ {print $2}')
+
+>&2 echo "Total pages $PAGES"
+
+
+# A page is kept only when its summed C+M+Y+K value exceeds THRESHOLD (tune per scanner).
+# Threshold for HP scanners
+# THRESHOLD=1
+# Threshold for Lexmark MC2425
+THRESHOLD=0.8
 
-PAGES=$(pdfinfo "${IN}" | grep -a "^Pages:" | tr -dc '0-9')
 
+# Print to stdout the 1-based number of every page of $IN whose summed
+# C+M+Y+K value, as reported by Ghostscript, exceeds THRESHOLD.
+# Progress messages go to stderr so they do not end up in the page list.
 non_blank() {
-for (( i="1"; i<="${PAGES}"; i++ )); do
-PERCENT=$(gs -o - -dFirstPage="${i}" -dLastPage="${i}" -sDEVICE=inkcov "${IN}" | grep CMYK | nawk 'BEGIN { sum=0; } {sum += $1 + $2 + $3 + $4;} END { printf "%.5f\n", sum } ')
-        if awk "BEGIN { exit !(${PERCENT} > ${THRESHOLD}) }"; then
-            echo "${i}"
+    for i in $(seq 1 "$PAGES"); do
+        PERCENT=$(gs -o - -dFirstPage="${i}" -dLastPage="${i}" -sDEVICE=ink_cov "${IN}" | grep CMYK | nawk 'BEGIN { sum=0; } {sum += $1 + $2 + $3 + $4;} END { printf "%.5f\n", sum } ')
+        >&2 echo -n "Color-sum in page $i is $PERCENT: "
+        if awk -v p="$PERCENT" -v t="$THRESHOLD" 'BEGIN { exit !(p > t) }'; then
+            echo "$i"
+            >&2 echo "Page added to document"
         else
-            >&2 echo "Color-sum is ${PERCENT}: will remove blank page ${i} of ${IN}"
+            >&2 echo "Page removed from document"
         fi
-done
+    done
 }
 
-NON_BLANK="$(non_blank)"
-NON_BLANK="$(tr '\n' ' ' <<<"${NON_BLANK}")"
-NON_BLANK="${NON_BLANK% }"
+NON_BLANK=$(non_blank)
 
-if [ -n "${NON_BLANK}" ]; then
-    NON_BLANK=$(echo "${NON_BLANK}" | tr ' ' ",")
-    qpdf "${IN}" --replace-input --pages . "${NON_BLANK}" --
+if [ -n "$NON_BLANK" ]; then
+    # non_blank prints one page number per line; join them with commas for qpdf --pages.
+    NON_BLANK=${NON_BLANK//$'\n'/,}
+    qpdf "$IN" --warning-exit-0 --replace-input --pages . "$NON_BLANK" --
 fi
+
 ```
 
 ## Cleaning with `qpdf`