mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge branch 'dev' into feature-ocrmypdf
This commit is contained in:
		| @@ -15,7 +15,7 @@ services: | |||||||
|       POSTGRES_PASSWORD: paperless |       POSTGRES_PASSWORD: paperless | ||||||
|  |  | ||||||
|   webserver: |   webserver: | ||||||
|     image: jonaswinkler/paperless-ng:0.9.3 |     image: jonaswinkler/paperless-ng:0.9.4 | ||||||
|     restart: always |     restart: always | ||||||
|     depends_on: |     depends_on: | ||||||
|       - db |       - db | ||||||
|   | |||||||
| @@ -5,7 +5,7 @@ services: | |||||||
|     restart: always |     restart: always | ||||||
|  |  | ||||||
|   webserver: |   webserver: | ||||||
|     image: jonaswinkler/paperless-ng:0.9.3 |     image: jonaswinkler/paperless-ng:0.9.4 | ||||||
|     restart: always |     restart: always | ||||||
|     depends_on: |     depends_on: | ||||||
|       - broker |       - broker | ||||||
|   | |||||||
| @@ -274,6 +274,7 @@ management command: | |||||||
|  |  | ||||||
| This command takes no arguments. | This command takes no arguments. | ||||||
|  |  | ||||||
|  | .. _`administration-index`: | ||||||
|  |  | ||||||
| Managing the document search index | Managing the document search index | ||||||
| ================================== | ================================== | ||||||
|   | |||||||
| @@ -8,12 +8,31 @@ Changelog | |||||||
| paperless-ng 0.9.4 | paperless-ng 0.9.4 | ||||||
| ################## | ################## | ||||||
|  |  | ||||||
| * Front end: Clickable tags, correspondents and types allow quick filtering for related documents. | * Searching: | ||||||
| * Front end: Saved views are now editable. |  | ||||||
| * Front end: Preview documents directly in the browser. |   * Paperless now supports searching by tags, types and dates. In order to have this applied to your | ||||||
|  |     existing documents, you need to perform a ``document_index reindex`` management command | ||||||
|  |     (see :ref:`administration-index`) | ||||||
|  |     that adds the new data to the search index. You only need to do this once, so that paperless can find | ||||||
|  |     your documents by tags, types and dates. Paperless keeps the index updated after that whenever | ||||||
|  |     something changes. | ||||||
|  |   * Paperless now has spelling corrections ("Did you mean") for mistyped queries. | ||||||
|  |   * The documentation contains :ref:`information about the query syntax <basic-searching>`. | ||||||
|  |  | ||||||
|  | * Front end: | ||||||
|  |  | ||||||
|  |   * Clickable tags, correspondents and types allow quick filtering for related documents. | ||||||
|  |   * Saved views are now editable. | ||||||
|  |   * Preview documents directly in the browser. | ||||||
|  |   * Navigation from the dashboard to saved views. | ||||||
|  |  | ||||||
| * Fixes: | * Fixes: | ||||||
|  |  | ||||||
|   * A severe error when trying to use post consume scripts. |   * A severe error when trying to use post consume scripts. | ||||||
| * The documentation now contains information about bare metal installs. |   * An error in the consumer that caused invalid messages about missing files to show up in the log. | ||||||
|  |  | ||||||
|  | * The documentation now contains information about bare metal installs and a section about | ||||||
|  |   how to set up the development environment. | ||||||
|  |  | ||||||
| paperless-ng 0.9.3 | paperless-ng 0.9.3 | ||||||
| ################## | ################## | ||||||
|   | |||||||
| @@ -156,6 +156,62 @@ REST API | |||||||
|  |  | ||||||
| You can also submit a document using the REST API, see :ref:`api-file_uploads` for details. | You can also submit a document using the REST API, see :ref:`api-file_uploads` for details. | ||||||
|  |  | ||||||
|  | .. _basic-searching: | ||||||
|  |  | ||||||
|  | Searching | ||||||
|  | ######### | ||||||
|  |  | ||||||
|  | Paperless offers an extensive searching mechanism that is designed to allow you to quickly | ||||||
|  | find a document you're looking for (for example, that thing that just broke and you bought | ||||||
|  | a couple months ago, that contract you signed 8 years ago). | ||||||
|  |  | ||||||
|  | When you search paperless for a document, it tries to match this query against your documents. | ||||||
|  | Paperless will look for matching documents by inspecting their content, title, correspondent, | ||||||
|  | type and tags. Paperless returns a scored list of results, so that documents matching your query | ||||||
|  | better will appear further up in the search results. | ||||||
|  |  | ||||||
|  | By default, paperless returns only documents which contain all words typed in the search bar. | ||||||
|  | However, paperless also offers advanced search syntax if you want to drill down the results | ||||||
|  | further. | ||||||
|  |  | ||||||
|  | Matching documents with logical expressions: | ||||||
|  |  | ||||||
|  | .. code:: none | ||||||
|  |  | ||||||
|  |   shopname AND (product1 OR product2) | ||||||
|  |  | ||||||
|  | Matching specific tags, correspondents or types: | ||||||
|  |  | ||||||
|  | .. code:: none | ||||||
|  |  | ||||||
|  |   type:invoice tag:unpaid | ||||||
|  |   correspondent:university certificate | ||||||
|  |  | ||||||
|  | Matching dates: | ||||||
|  |  | ||||||
|  | .. code:: none | ||||||
|  |    | ||||||
|  |   created:[2005 to 2009] | ||||||
|  |   added:yesterday | ||||||
|  |   modified:today | ||||||
|  |  | ||||||
|  | Matching inexact words: | ||||||
|  |  | ||||||
|  | .. code:: none | ||||||
|  |  | ||||||
|  |   produ*name | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |  | ||||||
|  |   Inexact terms are hard for search indexes. These queries might take a while to execute. That's why paperless offers | ||||||
|  |   auto complete and query correction. | ||||||
|  |  | ||||||
|  | All of these constructs can be combined as you see fit. | ||||||
|  | Paperless uses Whoosh's default query language. If you want to learn more about it, | ||||||
|  | head over to `Whoosh query language <https://whoosh.readthedocs.io/en/latest/querylang.html>`_. | ||||||
|  | For details on what date parsing utilities are available, see | ||||||
|  | `Date parsing <https://whoosh.readthedocs.io/en/latest/dates.html#parsing-date-queries>`_. | ||||||
|  |   | ||||||
|  |  | ||||||
| .. _usage-recommended_workflow: | .. _usage-recommended_workflow: | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,9 @@ | |||||||
| <app-widget-frame [title]="savedView.title"> | <app-widget-frame [title]="savedView.title"> | ||||||
|  |  | ||||||
|   <table class="table table-sm table-hover table-borderless"> |   <a header-buttons [routerLink]="" (click)="showAll()">Show all</a> | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   <table content class="table table-sm table-hover table-borderless"> | ||||||
|     <thead> |     <thead> | ||||||
|       <tr> |       <tr> | ||||||
|         <th>Created</th> |         <th>Created</th> | ||||||
|   | |||||||
| @@ -1,6 +1,8 @@ | |||||||
| import { Component, Input, OnInit } from '@angular/core'; | import { Component, Input, OnInit } from '@angular/core'; | ||||||
|  | import { Router } from '@angular/router'; | ||||||
| import { PaperlessDocument } from 'src/app/data/paperless-document'; | import { PaperlessDocument } from 'src/app/data/paperless-document'; | ||||||
| import { SavedViewConfig } from 'src/app/data/saved-view-config'; | import { SavedViewConfig } from 'src/app/data/saved-view-config'; | ||||||
|  | import { DocumentListViewService } from 'src/app/services/document-list-view.service'; | ||||||
| import { DocumentService } from 'src/app/services/rest/document.service'; | import { DocumentService } from 'src/app/services/rest/document.service'; | ||||||
|  |  | ||||||
| @Component({ | @Component({ | ||||||
| @@ -10,7 +12,10 @@ import { DocumentService } from 'src/app/services/rest/document.service'; | |||||||
| }) | }) | ||||||
| export class SavedViewWidgetComponent implements OnInit { | export class SavedViewWidgetComponent implements OnInit { | ||||||
|  |  | ||||||
|   constructor(private documentService: DocumentService) { } |   constructor( | ||||||
|  |     private documentService: DocumentService, | ||||||
|  |     private router: Router, | ||||||
|  |     private list: DocumentListViewService) { } | ||||||
|    |    | ||||||
|   @Input() |   @Input() | ||||||
|   savedView: SavedViewConfig |   savedView: SavedViewConfig | ||||||
| @@ -23,4 +28,9 @@ export class SavedViewWidgetComponent implements OnInit { | |||||||
|     }) |     }) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   showAll() { | ||||||
|  |     this.list.load(this.savedView) | ||||||
|  |     this.router.navigate(["documents"]) | ||||||
|  |   } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,4 +1,6 @@ | |||||||
| <app-widget-frame title="Statistics"> | <app-widget-frame title="Statistics"> | ||||||
|   <p class="card-text">Documents in inbox: {{statistics.documents_inbox}}</p> |   <ng-container content> | ||||||
|   <p class="card-text">Total documents: {{statistics.documents_total}}</p> |     <p class="card-text">Documents in inbox: {{statistics.documents_inbox}}</p> | ||||||
|  |     <p class="card-text">Total documents: {{statistics.documents_total}}</p> | ||||||
|  |   </ng-container> | ||||||
| </app-widget-frame> | </app-widget-frame> | ||||||
| @@ -1,6 +1,6 @@ | |||||||
| <app-widget-frame title="Upload new documents"> | <app-widget-frame title="Upload new documents"> | ||||||
|  |  | ||||||
|   <form> |   <form content> | ||||||
|     <ngx-file-drop  |     <ngx-file-drop  | ||||||
|       dropZoneLabel="Drop documents here or" (onFileDrop)="dropped($event)" |       dropZoneLabel="Drop documents here or" (onFileDrop)="dropped($event)" | ||||||
|       (onFileOver)="fileOver($event)" (onFileLeave)="fileLeave($event)" |       (onFileOver)="fileOver($event)" (onFileLeave)="fileLeave($event)" | ||||||
|   | |||||||
| @@ -1,8 +1,12 @@ | |||||||
| <div class="card mb-3 shadow"> | <div class="card mb-3 shadow"> | ||||||
|   <div class="card-header"> |   <div class="card-header"> | ||||||
|     <h5 class="card-title mb-0">{{title}}</h5> |     <div class="d-flex justify-content-between align-items-center"> | ||||||
|  |       <h5 class="card-title mb-0">{{title}}</h5> | ||||||
|  |       <ng-content select ="[header-buttons]"></ng-content> | ||||||
|  |     </div> | ||||||
|  |      | ||||||
|   </div> |   </div> | ||||||
|   <div class="card-body text-dark"> |   <div class="card-body text-dark"> | ||||||
|     <ng-content></ng-content> |     <ng-content select ="[content]"></ng-content> | ||||||
|   </div> |   </div> | ||||||
| </div> | </div> | ||||||
| @@ -9,9 +9,11 @@ | |||||||
|         <div class="d-flex justify-content-between align-items-center"> |         <div class="d-flex justify-content-between align-items-center"> | ||||||
|           <h5 class="card-title">     |           <h5 class="card-title">     | ||||||
|             <ng-container *ngIf="document.correspondent"> |             <ng-container *ngIf="document.correspondent"> | ||||||
|               <a [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{document.correspondent.name}}</a>: |               <a *ngIf="clickCorrespondent.observers.length ; else nolink" [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{document.correspondent.name}}</a> | ||||||
|  |               <ng-template #nolink>{{document.correspondent.name}}</ng-template>: | ||||||
|             </ng-container> |             </ng-container> | ||||||
|             {{document.title}}<app-tag [tag]="t" linkTitle="Filter by tag" *ngFor="let t of document.tags" class="ml-1" (click)="clickTag.emit(t)" [clickable]="true"></app-tag> |             {{document.title}} | ||||||
|  |             <app-tag [tag]="t" linkTitle="Filter by tag" *ngFor="let t of document.tags" class="ml-1" (click)="clickTag.emit(t)" [clickable]="clickTag.observers.length"></app-tag> | ||||||
|           </h5> |           </h5> | ||||||
|           <h5 class="card-title" *ngIf="document.archive_serial_number">#{{document.archive_serial_number}}</h5> |           <h5 class="card-title" *ngIf="document.archive_serial_number">#{{document.archive_serial_number}}</h5> | ||||||
|         </div> |         </div> | ||||||
|   | |||||||
| @@ -1,13 +1,21 @@ | |||||||
| <app-page-header title="Search results"> | <app-page-header title="Search results"> | ||||||
| </app-page-header> | </app-page-header> | ||||||
|  |  | ||||||
| <p>Search string: <i>{{query}}</i></p> | <div *ngIf="errorMessage" class="alert alert-danger">Invalid search query: {{errorMessage}}</div> | ||||||
|  |  | ||||||
| <div [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()"> | <p> | ||||||
|  |     Search string: <i>{{query}}</i> | ||||||
|  |     <ng-container *ngIf="correctedQuery"> | ||||||
|  |         - Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"? | ||||||
|  |     </ng-container> | ||||||
|  |  | ||||||
|  | </p> | ||||||
|  |  | ||||||
|  | <div *ngIf="!errorMessage" [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()"> | ||||||
|     <p>{{resultCount}} result(s)</p> |     <p>{{resultCount}} result(s)</p> | ||||||
|     <app-document-card-large *ngFor="let result of results" |     <app-document-card-large *ngFor="let result of results" | ||||||
|         [document]="result.document" |         [document]="result.document" | ||||||
|         [details]="result.highlights"> |         [details]="result.highlights"> | ||||||
|  |  | ||||||
| </app-document-card-large> | </app-document-card-large> | ||||||
| </div> | </div> | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| import { Component, OnInit } from '@angular/core'; | import { Component, OnInit } from '@angular/core'; | ||||||
| import { ActivatedRoute } from '@angular/router'; | import { ActivatedRoute, Router } from '@angular/router'; | ||||||
| import { SearchHit } from 'src/app/data/search-result'; | import { SearchHit } from 'src/app/data/search-result'; | ||||||
| import { SearchService } from 'src/app/services/rest/search.service'; | import { SearchService } from 'src/app/services/rest/search.service'; | ||||||
|  |  | ||||||
| @@ -9,7 +9,7 @@ import { SearchService } from 'src/app/services/rest/search.service'; | |||||||
|   styleUrls: ['./search.component.scss'] |   styleUrls: ['./search.component.scss'] | ||||||
| }) | }) | ||||||
| export class SearchComponent implements OnInit { | export class SearchComponent implements OnInit { | ||||||
|    |  | ||||||
|   results: SearchHit[] = [] |   results: SearchHit[] = [] | ||||||
|  |  | ||||||
|   query: string = "" |   query: string = "" | ||||||
| @@ -22,7 +22,11 @@ export class SearchComponent implements OnInit { | |||||||
|  |  | ||||||
|   resultCount |   resultCount | ||||||
|  |  | ||||||
|   constructor(private searchService: SearchService, private route: ActivatedRoute) { } |   correctedQuery: string = null | ||||||
|  |  | ||||||
|  |   errorMessage: string | ||||||
|  |  | ||||||
|  |   constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { } | ||||||
|  |  | ||||||
|   ngOnInit(): void { |   ngOnInit(): void { | ||||||
|     this.route.queryParamMap.subscribe(paramMap => { |     this.route.queryParamMap.subscribe(paramMap => { | ||||||
| @@ -31,10 +35,16 @@ export class SearchComponent implements OnInit { | |||||||
|       this.currentPage = 1 |       this.currentPage = 1 | ||||||
|       this.loadPage() |       this.loadPage() | ||||||
|     }) |     }) | ||||||
|      |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   searchCorrectedQuery() { | ||||||
|  |     this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}}) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   loadPage(append: boolean = false) { |   loadPage(append: boolean = false) { | ||||||
|  |     this.errorMessage = null | ||||||
|  |     this.correctedQuery = null | ||||||
|     this.searchService.search(this.query, this.currentPage).subscribe(result => { |     this.searchService.search(this.query, this.currentPage).subscribe(result => { | ||||||
|       if (append) { |       if (append) { | ||||||
|         this.results.push(...result.results) |         this.results.push(...result.results) | ||||||
| @@ -44,12 +54,17 @@ export class SearchComponent implements OnInit { | |||||||
|       this.pageCount = result.page_count |       this.pageCount = result.page_count | ||||||
|       this.searching = false |       this.searching = false | ||||||
|       this.resultCount = result.count |       this.resultCount = result.count | ||||||
|  |       this.correctedQuery = result.corrected_query | ||||||
|  |     }, error => { | ||||||
|  |       this.searching = false | ||||||
|  |       this.resultCount = 1 | ||||||
|  |       this.pageCount = 1 | ||||||
|  |       this.results = [] | ||||||
|  |       this.errorMessage = error.error | ||||||
|     }) |     }) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   onScroll() { |   onScroll() { | ||||||
|     console.log(this.currentPage) |  | ||||||
|     console.log(this.pageCount) |  | ||||||
|     if (this.currentPage < this.pageCount) { |     if (this.currentPage < this.pageCount) { | ||||||
|       this.currentPage += 1 |       this.currentPage += 1 | ||||||
|       this.loadPage(true) |       this.loadPage(true) | ||||||
|   | |||||||
| @@ -21,7 +21,9 @@ export interface SearchResult { | |||||||
|   page?: number |   page?: number | ||||||
|   page_count?: number |   page_count?: number | ||||||
|  |  | ||||||
|  |   corrected_query?: string | ||||||
|  |  | ||||||
|   results?: SearchHit[] |   results?: SearchHit[] | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -10,10 +10,11 @@ from django.db.models import Q | |||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
|  |  | ||||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||||
| from .file_handling import generate_filename, create_source_path_directory | from .file_handling import create_source_path_directory | ||||||
| from .loggers import LoggingMixin | from .loggers import LoggingMixin | ||||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||||
| from .parsers import ParseError, get_parser_class_for_mime_type, parse_date | from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||||
|  |     get_supported_file_extensions, parse_date | ||||||
| from .signals import ( | from .signals import ( | ||||||
|     document_consumption_finished, |     document_consumption_finished, | ||||||
|     document_consumption_started |     document_consumption_started | ||||||
| @@ -40,6 +41,21 @@ class Consumer(LoggingMixin): | |||||||
|             raise ConsumerError("Cannot consume {}: It is not a file".format( |             raise ConsumerError("Cannot consume {}: It is not a file".format( | ||||||
|                 self.path)) |                 self.path)) | ||||||
|  |  | ||||||
|  |     def pre_check_file_extension(self): | ||||||
|  |         extensions = get_supported_file_extensions() | ||||||
|  |         _, ext = os.path.splitext(self.filename) | ||||||
|  |  | ||||||
|  |         if not ext: | ||||||
|  |             raise ConsumerError( | ||||||
|  |                 f"Not consuming {self.filename}: File type unknown." | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |         if ext not in extensions: | ||||||
|  |             raise ConsumerError( | ||||||
|  |                 f"Not consuming {self.filename}: File extension {ext} does " | ||||||
|  |                 f"not map to any known file type ({str(extensions)})" | ||||||
|  |             ) | ||||||
|  |  | ||||||
|     def pre_check_duplicate(self): |     def pre_check_duplicate(self): | ||||||
|         with open(self.path, "rb") as f: |         with open(self.path, "rb") as f: | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |             checksum = hashlib.md5(f.read()).hexdigest() | ||||||
| @@ -82,6 +98,7 @@ class Consumer(LoggingMixin): | |||||||
|         # Make sure that preconditions for consuming the file are met. |         # Make sure that preconditions for consuming the file are met. | ||||||
|  |  | ||||||
|         self.pre_check_file_exists() |         self.pre_check_file_exists() | ||||||
|  |         self.pre_check_file_extension() | ||||||
|         self.pre_check_directories() |         self.pre_check_directories() | ||||||
|         self.pre_check_duplicate() |         self.pre_check_duplicate() | ||||||
|  |  | ||||||
|   | |||||||
| @@ -4,10 +4,11 @@ from contextlib import contextmanager | |||||||
|  |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from whoosh import highlight | from whoosh import highlight | ||||||
| from whoosh.fields import Schema, TEXT, NUMERIC | from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME | ||||||
| from whoosh.highlight import Formatter, get_text | from whoosh.highlight import Formatter, get_text | ||||||
| from whoosh.index import create_in, exists_in, open_dir | from whoosh.index import create_in, exists_in, open_dir | ||||||
| from whoosh.qparser import MultifieldParser | from whoosh.qparser import MultifieldParser | ||||||
|  | from whoosh.qparser.dateparse import DateParserPlugin | ||||||
| from whoosh.writing import AsyncWriter | from whoosh.writing import AsyncWriter | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -59,14 +60,19 @@ def get_schema(): | |||||||
|         id=NUMERIC(stored=True, unique=True, numtype=int), |         id=NUMERIC(stored=True, unique=True, numtype=int), | ||||||
|         title=TEXT(stored=True), |         title=TEXT(stored=True), | ||||||
|         content=TEXT(), |         content=TEXT(), | ||||||
|         correspondent=TEXT(stored=True) |         correspondent=TEXT(stored=True), | ||||||
|  |         tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True), | ||||||
|  |         type=TEXT(stored=True), | ||||||
|  |         created=DATETIME(stored=True, sortable=True), | ||||||
|  |         modified=DATETIME(stored=True, sortable=True), | ||||||
|  |         added=DATETIME(stored=True, sortable=True), | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
| def open_index(recreate=False): | def open_index(recreate=False): | ||||||
|     try: |     try: | ||||||
|         if exists_in(settings.INDEX_DIR) and not recreate: |         if exists_in(settings.INDEX_DIR) and not recreate: | ||||||
|             return open_dir(settings.INDEX_DIR) |             return open_dir(settings.INDEX_DIR, schema=get_schema()) | ||||||
|     except Exception as e: |     except Exception as e: | ||||||
|         logger.error(f"Error while opening the index: {e}, recreating.") |         logger.error(f"Error while opening the index: {e}, recreating.") | ||||||
|  |  | ||||||
| @@ -77,11 +83,17 @@ def open_index(recreate=False): | |||||||
|  |  | ||||||
| def update_document(writer, doc): | def update_document(writer, doc): | ||||||
|     logger.debug("Indexing {}...".format(doc)) |     logger.debug("Indexing {}...".format(doc)) | ||||||
|  |     tags = ",".join([t.name for t in doc.tags.all()]) | ||||||
|     writer.update_document( |     writer.update_document( | ||||||
|         id=doc.pk, |         id=doc.pk, | ||||||
|         title=doc.title, |         title=doc.title, | ||||||
|         content=doc.content, |         content=doc.content, | ||||||
|         correspondent=doc.correspondent.name if doc.correspondent else None |         correspondent=doc.correspondent.name if doc.correspondent else None, | ||||||
|  |         tag=tags if tags else None, | ||||||
|  |         type=doc.document_type.name if doc.document_type else None, | ||||||
|  |         created=doc.created, | ||||||
|  |         added=doc.added, | ||||||
|  |         modified=doc.modified, | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -103,16 +115,27 @@ def remove_document_from_index(document): | |||||||
|  |  | ||||||
|  |  | ||||||
| @contextmanager | @contextmanager | ||||||
| def query_page(ix, query, page): | def query_page(ix, querystring, page): | ||||||
|     searcher = ix.searcher() |     searcher = ix.searcher() | ||||||
|     try: |     try: | ||||||
|         query_parser = MultifieldParser(["content", "title", "correspondent"], |         qp = MultifieldParser( | ||||||
|                                         ix.schema).parse(query) |             ["content", "title", "correspondent", "tag", "type"], | ||||||
|         result_page = searcher.search_page(query_parser, page) |             ix.schema) | ||||||
|  |         qp.add_plugin(DateParserPlugin()) | ||||||
|  |  | ||||||
|  |         q = qp.parse(querystring) | ||||||
|  |         result_page = searcher.search_page(q, page) | ||||||
|         result_page.results.fragmenter = highlight.ContextFragmenter( |         result_page.results.fragmenter = highlight.ContextFragmenter( | ||||||
|             surround=50) |             surround=50) | ||||||
|         result_page.results.formatter = JsonFormatter() |         result_page.results.formatter = JsonFormatter() | ||||||
|         yield result_page |  | ||||||
|  |         corrected = searcher.correct_query(q, querystring) | ||||||
|  |         if corrected.query != q: | ||||||
|  |             corrected_query = corrected.string | ||||||
|  |         else: | ||||||
|  |             corrected_query = None | ||||||
|  |  | ||||||
|  |         yield result_page, corrected_query | ||||||
|     finally: |     finally: | ||||||
|         searcher.close() |         searcher.close() | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,7 +1,6 @@ | |||||||
| # coding=utf-8 | # coding=utf-8 | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
| import mimetypes |  | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
| @@ -12,6 +11,8 @@ from django.db import models | |||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from django.utils.text import slugify | from django.utils.text import slugify | ||||||
|  |  | ||||||
|  | from documents.parsers import get_default_file_extension | ||||||
|  |  | ||||||
|  |  | ||||||
| class MatchingModel(models.Model): | class MatchingModel(models.Model): | ||||||
|  |  | ||||||
| @@ -204,7 +205,7 @@ class Document(models.Model): | |||||||
|         ordering = ("correspondent", "title") |         ordering = ("correspondent", "title") | ||||||
|  |  | ||||||
|     def __str__(self): |     def __str__(self): | ||||||
|         created = self.created.strftime("%Y%m%d%H%M%S") |         created = self.created.strftime("%Y%m%d") | ||||||
|         if self.correspondent and self.title: |         if self.correspondent and self.title: | ||||||
|             return "{}: {} - {}".format( |             return "{}: {} - {}".format( | ||||||
|                 created, self.correspondent, self.title) |                 created, self.correspondent, self.title) | ||||||
| @@ -255,8 +256,7 @@ class Document(models.Model): | |||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def file_type(self): |     def file_type(self): | ||||||
|         # TODO: this is not stable across python versions |         return get_default_file_extension(self.mime_type) | ||||||
|         return mimetypes.guess_extension(str(self.mime_type)) |  | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def thumbnail_path(self): |     def thumbnail_path(self): | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| import logging | import logging | ||||||
|  | import mimetypes | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import shutil | import shutil | ||||||
| @@ -42,6 +43,29 @@ def is_mime_type_supported(mime_type): | |||||||
|     return get_parser_class_for_mime_type(mime_type) is not None |     return get_parser_class_for_mime_type(mime_type) is not None | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def get_default_file_extension(mime_type): | ||||||
|  |     for response in document_consumer_declaration.send(None): | ||||||
|  |         parser_declaration = response[1] | ||||||
|  |         supported_mime_types = parser_declaration["mime_types"] | ||||||
|  |  | ||||||
|  |         if mime_type in supported_mime_types: | ||||||
|  |             return supported_mime_types[mime_type] | ||||||
|  |  | ||||||
|  |     return None | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def get_supported_file_extensions(): | ||||||
|  |     extensions = set() | ||||||
|  |     for response in document_consumer_declaration.send(None): | ||||||
|  |         parser_declaration = response[1] | ||||||
|  |         supported_mime_types = parser_declaration["mime_types"] | ||||||
|  |  | ||||||
|  |         for mime_type in supported_mime_types: | ||||||
|  |             extensions.update(mimetypes.guess_all_extensions(mime_type)) | ||||||
|  |  | ||||||
|  |     return extensions | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_parser_class_for_mime_type(mime_type): | def get_parser_class_for_mime_type(mime_type): | ||||||
|  |  | ||||||
|     options = [] |     options = [] | ||||||
|   | |||||||
| @@ -325,6 +325,22 @@ class DocumentApiTest(DirectoriesMixin, APITestCase): | |||||||
|         self.assertEqual(response.status_code, 200) |         self.assertEqual(response.status_code, 200) | ||||||
|         self.assertEqual(len(response.data), 10) |         self.assertEqual(len(response.data), 10) | ||||||
|  |  | ||||||
|  |     def test_search_spelling_correction(self): | ||||||
|  |         with AsyncWriter(index.open_index()) as writer: | ||||||
|  |             for i in range(55): | ||||||
|  |                 doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}") | ||||||
|  |                 index.update_document(writer, doc) | ||||||
|  |  | ||||||
|  |         response = self.client.get("/api/search/?query=thing") | ||||||
|  |         correction = response.data['corrected_query'] | ||||||
|  |  | ||||||
|  |         self.assertEqual(correction, "things") | ||||||
|  |  | ||||||
|  |         response = self.client.get("/api/search/?query=things") | ||||||
|  |         correction = response.data['corrected_query'] | ||||||
|  |  | ||||||
|  |         self.assertEqual(correction, None) | ||||||
|  |  | ||||||
|     def test_statistics(self): |     def test_statistics(self): | ||||||
|  |  | ||||||
|         doc1 = Document.objects.create(title="none1", checksum="A") |         doc1 = Document.objects.create(title="none1", checksum="A") | ||||||
|   | |||||||
| @@ -425,7 +425,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|         m = patcher.start() |         m = patcher.start() | ||||||
|         m.return_value = [(None, { |         m.return_value = [(None, { | ||||||
|             "parser": self.make_dummy_parser, |             "parser": self.make_dummy_parser, | ||||||
|             "mime_types": ["application/pdf"], |             "mime_types": {"application/pdf": ".pdf"}, | ||||||
|             "weight": 0 |             "weight": 0 | ||||||
|         })] |         })] | ||||||
|  |  | ||||||
| @@ -551,7 +551,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|         try: |         try: | ||||||
|             self.consumer.try_consume_file(self.get_test_file()) |             self.consumer.try_consume_file(self.get_test_file()) | ||||||
|         except ConsumerError as e: |         except ConsumerError as e: | ||||||
|             self.assertTrue(str(e).startswith("No parsers abvailable")) |             self.assertTrue("File extension .pdf does not map to any" in str(e)) | ||||||
|             return |             return | ||||||
|  |  | ||||||
|         self.fail("Should throw exception") |         self.fail("Should throw exception") | ||||||
| @@ -560,7 +560,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|     def testFaultyParser(self, m): |     def testFaultyParser(self, m): | ||||||
|         m.return_value = [(None, { |         m.return_value = [(None, { | ||||||
|             "parser": self.make_faulty_parser, |             "parser": self.make_faulty_parser, | ||||||
|             "mime_types": ["application/pdf"], |             "mime_types": {"application/pdf": ".pdf"}, | ||||||
|             "weight": 0 |             "weight": 0 | ||||||
|         })] |         })] | ||||||
|  |  | ||||||
|   | |||||||
| @@ -6,7 +6,10 @@ from unittest import mock | |||||||
|  |  | ||||||
| from django.test import TestCase, override_settings | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
| from documents.parsers import get_parser_class, DocumentParser | from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | ||||||
|  |     get_parser_class_for_mime_type, DocumentParser | ||||||
|  | from paperless_tesseract.parsers import RasterisedDocumentParser | ||||||
|  | from paperless_text.parsers import TextDocumentParser | ||||||
|  |  | ||||||
|  |  | ||||||
| def fake_magic_from_file(file, mime=False): | def fake_magic_from_file(file, mime=False): | ||||||
| @@ -29,7 +32,7 @@ class TestParserDiscovery(TestCase): | |||||||
|             pass |             pass | ||||||
|  |  | ||||||
|         m.return_value = ( |         m.return_value = ( | ||||||
|             (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}), |             (None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}), | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         self.assertEqual( |         self.assertEqual( | ||||||
| @@ -47,8 +50,8 @@ class TestParserDiscovery(TestCase): | |||||||
|             pass |             pass | ||||||
|  |  | ||||||
|         m.return_value = ( |         m.return_value = ( | ||||||
|             (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}), |             (None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}), | ||||||
|             (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}), |             (None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}), | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         self.assertEqual( |         self.assertEqual( | ||||||
| @@ -96,3 +99,20 @@ class TestBaseParser(TestCase): | |||||||
|         path = parser.get_optimised_thumbnail("any", "not important") |         path = parser.get_optimised_thumbnail("any", "not important") | ||||||
|         self.assertEqual(path, fake_get_thumbnail(None, None, None)) |         self.assertEqual(path, fake_get_thumbnail(None, None, None)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestParserAvailability(TestCase): | ||||||
|  |  | ||||||
|  |     def test_file_extensions(self): | ||||||
|  |  | ||||||
|  |         for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]: | ||||||
|  |             self.assertIn(ext, get_supported_file_extensions()) | ||||||
|  |         self.assertEqual(get_default_file_extension('application/pdf'), ".pdf") | ||||||
|  |         self.assertEqual(get_default_file_extension('image/png'), ".png") | ||||||
|  |         self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg") | ||||||
|  |         self.assertEqual(get_default_file_extension('text/plain'), ".txt") | ||||||
|  |         self.assertEqual(get_default_file_extension('text/csv'), ".csv") | ||||||
|  |         self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), None) | ||||||
|  |  | ||||||
|  |         self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser) | ||||||
|  |         self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser) | ||||||
|  |         self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None) | ||||||
|   | |||||||
| @@ -236,30 +236,34 @@ class SearchView(APIView): | |||||||
|                 } |                 } | ||||||
|  |  | ||||||
|     def get(self, request, format=None): |     def get(self, request, format=None): | ||||||
|         if 'query' in request.query_params: |         if 'query' not in request.query_params: | ||||||
|             query = request.query_params['query'] |  | ||||||
|             try: |  | ||||||
|                 page = int(request.query_params.get('page', 1)) |  | ||||||
|             except (ValueError, TypeError): |  | ||||||
|                 page = 1 |  | ||||||
|  |  | ||||||
|             if page < 1: |  | ||||||
|                 page = 1 |  | ||||||
|  |  | ||||||
|             with index.query_page(self.ix, query, page) as result_page: |  | ||||||
|                 return Response( |  | ||||||
|                     {'count': len(result_page), |  | ||||||
|                      'page': result_page.pagenum, |  | ||||||
|                      'page_count': result_page.pagecount, |  | ||||||
|                      'results': list(map(self.add_infos_to_hit, result_page))}) |  | ||||||
|  |  | ||||||
|         else: |  | ||||||
|             return Response({ |             return Response({ | ||||||
|                 'count': 0, |                 'count': 0, | ||||||
|                 'page': 0, |                 'page': 0, | ||||||
|                 'page_count': 0, |                 'page_count': 0, | ||||||
|                 'results': []}) |                 'results': []}) | ||||||
|  |  | ||||||
|  |         query = request.query_params['query'] | ||||||
|  |         try: | ||||||
|  |             page = int(request.query_params.get('page', 1)) | ||||||
|  |         except (ValueError, TypeError): | ||||||
|  |             page = 1 | ||||||
|  |  | ||||||
|  |         if page < 1: | ||||||
|  |             page = 1 | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             with index.query_page(self.ix, query, page) as (result_page, | ||||||
|  |                                                             corrected_query): | ||||||
|  |                 return Response( | ||||||
|  |                     {'count': len(result_page), | ||||||
|  |                      'page': result_page.pagenum, | ||||||
|  |                      'page_count': result_page.pagecount, | ||||||
|  |                      'corrected_query': corrected_query, | ||||||
|  |                      'results': list(map(self.add_infos_to_hit, result_page))}) | ||||||
|  |         except Exception as e: | ||||||
|  |             return HttpResponseBadRequest(str(e)) | ||||||
|  |  | ||||||
|  |  | ||||||
| class SearchAutoCompleteView(APIView): | class SearchAutoCompleteView(APIView): | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1 +1 @@ | |||||||
| __version__ = (0, 9, 3) | __version__ = (0, 9, 4) | ||||||
|   | |||||||
| @@ -5,9 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs): | |||||||
|     return { |     return { | ||||||
|         "parser": RasterisedDocumentParser, |         "parser": RasterisedDocumentParser, | ||||||
|         "weight": 0, |         "weight": 0, | ||||||
|         "mime_types": [ |         "mime_types": { | ||||||
|             "application/pdf", |             "application/pdf": ".pdf", | ||||||
|             "image/jpeg", |             "image/jpeg": ".jpg", | ||||||
|             "image/png" |             "image/png": ".png" | ||||||
|         ] |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs): | |||||||
|     return { |     return { | ||||||
|         "parser": TextDocumentParser, |         "parser": TextDocumentParser, | ||||||
|         "weight": 10, |         "weight": 10, | ||||||
|         "mime_types": [ |         "mime_types": { | ||||||
|             "text/plain", |             "text/plain": ".txt", | ||||||
|             "text/comma-separated-values" |             "text/csv": ".csv", | ||||||
|         ] |         } | ||||||
|     } |     } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler