mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	split function for reading barcode and separating pages
This commit is contained in:
		@@ -107,14 +107,17 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
 | 
				
			|||||||
    return newpath
 | 
					    return newpath
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
 | 
					def scan_file_for_barcodes(
 | 
				
			||||||
 | 
					    filepath: str,
 | 
				
			||||||
 | 
					) -> Tuple[Optional[str], List[Tuple[int, str]]]:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Scan the provided pdf file for page separating barcodes
 | 
					    Scan the provided pdf file for any barcodes
 | 
				
			||||||
    Returns a PDF filepath and a list of pagenumbers,
 | 
					    Returns a PDF filepath and a list of
 | 
				
			||||||
    which separate the file into new files
 | 
					    (page_number, barcode_text) tuples
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _pikepdf_barcode_scan(pdf_filepath: str):
 | 
					    def _pikepdf_barcode_scan(pdf_filepath: str):
 | 
				
			||||||
 | 
					        detected_barcodes = []
 | 
				
			||||||
        with Pdf.open(pdf_filepath) as pdf:
 | 
					        with Pdf.open(pdf_filepath) as pdf:
 | 
				
			||||||
            for page_num, page in enumerate(pdf.pages):
 | 
					            for page_num, page in enumerate(pdf.pages):
 | 
				
			||||||
                for image_key in page.images:
 | 
					                for image_key in page.images:
 | 
				
			||||||
@@ -132,24 +135,27 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
				
			|||||||
                    # raise an exception, triggering fallback
 | 
					                    # raise an exception, triggering fallback
 | 
				
			||||||
                    pillow_img = pdfimage.as_pil_image()
 | 
					                    pillow_img = pdfimage.as_pil_image()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    detected_barcodes = barcode_reader(pillow_img)
 | 
					                    barcodes_on_page = barcode_reader(pillow_img)
 | 
				
			||||||
 | 
					                    detected_barcodes.extend(
 | 
				
			||||||
                    if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
 | 
					                        [(page_num, text) for text in barcodes_on_page]
 | 
				
			||||||
                        separator_page_numbers.append(page_num)
 | 
					                    )
 | 
				
			||||||
 | 
					        return detected_barcodes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _pdf2image_barcode_scan(pdf_filepath: str):
 | 
					    def _pdf2image_barcode_scan(pdf_filepath: str):
 | 
				
			||||||
 | 
					        detected_barcodes = []
 | 
				
			||||||
        # use a temporary directory in case the file is too big to handle in memory
 | 
					        # use a temporary directory in case the file is too big to handle in memory
 | 
				
			||||||
        with tempfile.TemporaryDirectory() as path:
 | 
					        with tempfile.TemporaryDirectory() as path:
 | 
				
			||||||
            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
 | 
					            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
 | 
				
			||||||
            for current_page_number, page in enumerate(pages_from_path):
 | 
					            for current_page_number, page in enumerate(pages_from_path):
 | 
				
			||||||
                current_barcodes = barcode_reader(page)
 | 
					                barcodes_on_page = barcode_reader(page)
 | 
				
			||||||
                if settings.CONSUMER_BARCODE_STRING in current_barcodes:
 | 
					                detected_barcodes.extend(
 | 
				
			||||||
                    separator_page_numbers.append(current_page_number)
 | 
					                    [(current_page_number, text) for text in barcodes_on_page]
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					        return detected_barcodes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    separator_page_numbers = []
 | 
					 | 
				
			||||||
    pdf_filepath = None
 | 
					    pdf_filepath = None
 | 
				
			||||||
 | 
					 | 
				
			||||||
    mime_type = get_file_mime_type(filepath)
 | 
					    mime_type = get_file_mime_type(filepath)
 | 
				
			||||||
 | 
					    barcodes = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if supported_file_type(mime_type):
 | 
					    if supported_file_type(mime_type):
 | 
				
			||||||
        pdf_filepath = filepath
 | 
					        pdf_filepath = filepath
 | 
				
			||||||
@@ -159,7 +165,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
				
			|||||||
        # Always try pikepdf first, it's usually fine, faster and
 | 
					        # Always try pikepdf first, it's usually fine, faster and
 | 
				
			||||||
        # uses less memory
 | 
					        # uses less memory
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            _pikepdf_barcode_scan(pdf_filepath)
 | 
					            barcodes = _pikepdf_barcode_scan(pdf_filepath)
 | 
				
			||||||
        # Password protected files can't be checked
 | 
					        # Password protected files can't be checked
 | 
				
			||||||
        except PasswordError as e:
 | 
					        except PasswordError as e:
 | 
				
			||||||
            logger.warning(
 | 
					            logger.warning(
 | 
				
			||||||
@@ -172,9 +178,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
				
			|||||||
                f"Falling back to pdf2image because: {e}",
 | 
					                f"Falling back to pdf2image because: {e}",
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
                # Clear the list in case some processing worked
 | 
					                barcodes = _pdf2image_barcode_scan(pdf_filepath)
 | 
				
			||||||
                separator_page_numbers = []
 | 
					 | 
				
			||||||
                _pdf2image_barcode_scan(pdf_filepath)
 | 
					 | 
				
			||||||
            # This file is really borked, allow the consumption to continue
 | 
					            # This file is really borked, allow the consumption to continue
 | 
				
			||||||
            # but it may fail further on
 | 
					            # but it may fail further on
 | 
				
			||||||
            except Exception as e:  # pragma: no cover
 | 
					            except Exception as e:  # pragma: no cover
 | 
				
			||||||
@@ -186,6 +190,28 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
 | 
				
			|||||||
        logger.warning(
 | 
					        logger.warning(
 | 
				
			||||||
            f"Unsupported file format for barcode reader: {str(mime_type)}",
 | 
					            f"Unsupported file format for barcode reader: {str(mime_type)}",
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return pdf_filepath, barcodes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    Scan the provided pdf file for page separating barcodes
 | 
				
			||||||
 | 
					    Returns a PDF filepath and a list of pagenumbers,
 | 
				
			||||||
 | 
					    which separate the file into new files
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    separator_page_numbers = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # filter all barcodes for the separator string
 | 
				
			||||||
 | 
					    separator_barcodes = list(
 | 
				
			||||||
 | 
					        filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # get the page numbers of the separating barcodes
 | 
				
			||||||
 | 
					    separator_page_numbers = [page for page, _ in separator_barcodes]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return pdf_filepath, separator_page_numbers
 | 
					    return pdf_filepath, separator_page_numbers
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user