Merge remote-tracking branch 'upstream/dev' into feature-bulk-editor

2025-12-16 01:31:09 -06:00 · 2020-12-20 07:49:27 -08:00
parent fa7b90a584 c6b9e2b544
commit f06e2c1089
35 changed files with 657 additions and 658 deletions
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -221,21 +221,16 @@ Each fragment contains a list of strings, and some of them are marked as a highl

    [
        [
-            {"text": "This is a sample text with a "},
-            {"text": "highlighted", "term": 0},
-            {"text": " word."}
+            {"text": "This is a sample text with a ", "highlight": false},
+            {"text": "highlighted", "highlight": true},
+            {"text": " word.", "highlight": false}
        ],
        [
-            {"text": "Another", "term": 1},
-            {"text": " fragment with a highlight."}
+            {"text": "Another", "highlight": true},
+            {"text": " fragment with a highlight.", "highlight": false}
        ]
    ]

-
-
-When ``term`` is present within a string, the word within ``text`` should be highlighted.
-The term index groups multiple matches together and words with the same index
-should get identical highlighting.
 A client may use this example to produce the following output:

 ... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
--- a/src-ui/src/app/components/document-detail/document-detail.component.html
+++ b/src-ui/src/app/components/document-detail/document-detail.component.html
@@ -1,4 +1,14 @@
 <app-page-header [(title)]="title">
+    <div class="input-group input-group-sm mr-5" *ngIf="getContentType() == 'application/pdf'">
+      <div class="input-group-prepend">
+        <div class="input-group-text">Page </div>
+      </div>
+      <input class="form-control flex-grow-0 w-auto" type="number" min="1" [max]="previewNumPages" [(ngModel)]="previewCurrentPage" />
+      <div class="input-group-append">
+        <div class="input-group-text">of {{previewNumPages}}</div>
+      </div>
+    </div>
+
    <button type="button" class="btn btn-sm btn-outline-danger mr-2" (click)="delete()">
        <svg class="buttonicon" fill="currentColor">
            <use xlink:href="assets/bootstrap-icons.svg#trash" />
@@ -24,6 +34,12 @@

    </div>

+    <button type="button" class="btn btn-sm btn-outline-primary mr-2" (click)="moreLike()">
+        <svg class="buttonicon" fill="currentColor">
+            <use xlink:href="assets/bootstrap-icons.svg#three-dots" />
+        </svg>
+        <span class="d-none d-lg-inline"> More like this</span>
+    </button>

    <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()">
        <svg class="buttonicon" fill="currentColor">
@@ -128,7 +144,7 @@

    <div class="col-md-6 col-xl-8 mb-3">
      <div class="pdf-viewer-container" *ngIf="getContentType() == 'application/pdf'">
-        <pdf-viewer [src]="previewUrl" [original-size]="false" [show-borders]="true"></pdf-viewer>
+        <pdf-viewer [src]="previewUrl" [original-size]="false" [show-borders]="true" [show-all]="true" [(page)]="previewCurrentPage" (after-load-complete)="pdfPreviewLoaded($event)"></pdf-viewer>
      </div>
    </div>
 </div>
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@@ -15,6 +15,7 @@ import { DocumentService } from 'src/app/services/rest/document.service';
 import { ConfirmDialogComponent } from '../common/confirm-dialog/confirm-dialog.component';
 import { CorrespondentEditDialogComponent } from '../manage/correspondent-list/correspondent-edit-dialog/correspondent-edit-dialog.component';
 import { DocumentTypeEditDialogComponent } from '../manage/document-type-list/document-type-edit-dialog/document-type-edit-dialog.component';
+import { PDFDocumentProxy } from 'ng2-pdf-viewer';

@Component({
  selector: 'app-document-detail',
@@ -47,6 +48,9 @@ export class DocumentDetailComponent implements OnInit {
    tags: new FormControl([])
  })

+  previewCurrentPage: number = 1
+  previewNumPages: number = 1
+
  constructor(
    private documentsService: DocumentService,
    private route: ActivatedRoute,
@@ -168,7 +172,16 @@ export class DocumentDetailComponent implements OnInit {

  }

+  moreLike() {
+    this.router.navigate(["search"], {queryParams: {more_like:this.document.id}})
+  }
+
  hasNext() {
    return this.documentListViewService.hasNext(this.documentId)
  }
+
+  pdfPreviewLoaded(pdf: PDFDocumentProxy) {
+    this.previewNumPages = pdf.numPages
+  }
+
 }
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
@@ -23,8 +23,14 @@
        </p>


-        <div class="d-flex justify-content-between align-items-center">
+        <div class="d-flex align-items-center">
          <div class="btn-group">
+            <a routerLink="/search" [queryParams]="{'more_like': document.id}" class="btn btn-sm btn-outline-secondary" *ngIf="moreLikeThis">
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots" viewBox="0 0 16 16">
+                <path fill-rule="evenodd" d="M3 9.5a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3zm5 0a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3zm5 0a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3z"/>
+              </svg>
+              More like this
+            </a>
            <a routerLink="/documents/{{document.id}}" class="btn btn-sm btn-outline-secondary">
              <svg width="1em" height="1em" viewBox="0 0 16 16" class="bi bi-pencil" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
                <path fill-rule="evenodd" d="M12.146.146a.5.5 0 0 1 .708 0l3 3a.5.5 0 0 1 0 .708l-10 10a.5.5 0 0 1-.168.11l-5 2a.5.5 0 0 1-.65-.65l2-5a.5.5 0 0 1 .11-.168l10-10zM11.207 2.5L13.5 4.793 14.793 3.5 12.5 1.207 11.207 2.5zm1.586 3L10.5 3.207 4 9.707V10h.5a.5.5 0 0 1 .5.5v.5h.5a.5.5 0 0 1 .5.5v.5h.293l6.5-6.5zm-9.761 5.175l-.106.106-1.528 3.821 3.821-1.528.106-.106A.5.5 0 0 1 5 12.5V12h-.5a.5.5 0 0 1-.5-.5V11h-.5a.5.5 0 0 1-.468-.325z"/>
@@ -45,7 +51,13 @@
              </svg>
              Download
            </a>
+            
          </div>
+
+          <div class="d-flex align-items-center ml-auto">
+            <small class="text-muted" *ngIf="searchScore">Score:</small>
+            <ngb-progressbar *ngIf="searchScore" [type]="searchScoreClass" [value]="searchScore" class="search-score-bar mx-2" [max]="1"></ngb-progressbar>
+          </div>
          <small class="text-muted">Created: {{document.created | date}}</small>
        </div>
        
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.scss
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.scss
@@ -10,3 +10,9 @@
  position: absolute;

 }
+
+.search-score-bar {
+  width: 100px;
+  height: 5px;
+  margin-top: 2px;
+}
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts
@@ -12,6 +12,9 @@ export class DocumentCardLargeComponent implements OnInit {

  constructor(private documentService: DocumentService, private sanitizer: DomSanitizer) { }

+  @Input()
+  moreLikeThis: boolean = false
+
  @Input()
  document: PaperlessDocument

@@ -24,6 +27,19 @@ export class DocumentCardLargeComponent implements OnInit {
  @Output()
  clickCorrespondent = new EventEmitter<number>()

+  @Input()
+  searchScore: number
+
+  get searchScoreClass() {
+    if (this.searchScore > 0.7) {
+      return "success"
+    } else if (this.searchScore > 0.3) {
+      return "warning"
+    } else {
+      return "danger"
+    }
+  }
+
  ngOnInit(): void {
  }

--- a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html
+++ b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html
@@ -1,4 +1,4 @@
-<div class="col p-2 h-100 document-card" style="width: 16rem;">
+<div class="col p-2 h-100">
  <div class="card h-100 shadow-sm" [class.card-selected]="selected">
    <div class="border-bottom" [class.doc-img-background-selected]="selected">
      <img class="card-img doc-img" [src]="getThumbUrl()" (click)="selected = !selected">
--- a/src-ui/src/app/components/document-list/document-list.component.html
+++ b/src-ui/src/app/components/document-list/document-list.component.html
@@ -151,5 +151,5 @@


 <div class=" m-n2 row" *ngIf="displayMode == 'smallCards'">
-  <app-document-card-small [selected]="list.isSelected(d)" (selectedChange)="list.setSelected(d, $event)" [document]="d" *ngFor="let d of list.documents" (clickTag)="clickTag($event)" (clickCorrespondent)="clickCorrespondent($event)"></app-document-card-small>
+  <app-document-card-small [document]="d" [selected]="list.isSelected(d)" (selectedChange)="list.setSelected(d, $event)" *ngFor="let d of list.documents" (clickTag)="clickTag($event)" (clickCorrespondent)="clickCorrespondent($event)"></app-document-card-small>
 </div>
--- a/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
+++ b/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
@@ -1,3 +1,3 @@
 ... <span *ngFor="let fragment of highlights">
-    <span *ngFor="let token of fragment" [ngClass]="token.term != null ? 'match term'+ token.term : ''">{{token.text}}</span> ... 
+    <span *ngFor="let token of fragment" [class.match]="token.highlight">{{token.text}}</span> ... 
 </span>
--- a/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
+++ b/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
@@ -1,4 +1,4 @@
 .match {
    color: black;
-    background-color: orange;
+    background-color: rgb(255, 211, 66);
 }
--- a/src-ui/src/app/components/search/search.component.html
+++ b/src-ui/src/app/components/search/search.component.html
@@ -3,7 +3,12 @@

 <div *ngIf="errorMessage" class="alert alert-danger">Invalid search query: {{errorMessage}}</div>

-<p>
+<p *ngIf="more_like">
+    Showing documents similar to
+    <a routerLink="/documents/{{more_like}}">{{more_like_doc?.original_file_name}}</a>
+</p>
+
+<p *ngIf="query">
    Search string: <i>{{query}}</i>
    <ng-container *ngIf="correctedQuery">
        - Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"?
@@ -15,7 +20,9 @@
    <p>{{resultCount}} result(s)</p>
    <app-document-card-large *ngFor="let result of results"
        [document]="result.document"
-        [details]="result.highlights">
+        [details]="result.highlights"
+        [searchScore]="result.score / maxScore"
+        [moreLikeThis]="true">

 </app-document-card-large>
 </div>
--- a/src-ui/src/app/components/search/search.component.ts
+++ b/src-ui/src/app/components/search/search.component.ts
@@ -1,6 +1,9 @@
 import { Component, OnInit } from '@angular/core';
 import { ActivatedRoute, Router } from '@angular/router';
+import { PaperlessDocument } from 'src/app/data/paperless-document';
+import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
 import { SearchHit } from 'src/app/data/search-result';
+import { DocumentService } from 'src/app/services/rest/document.service';
 import { SearchService } from 'src/app/services/rest/search.service';

@Component({
@@ -14,6 +17,10 @@ export class SearchComponent implements OnInit {

  query: string = ""

+  more_like: number
+
+  more_like_doc: PaperlessDocument
+
  searching = false

  currentPage = 1
@@ -26,11 +33,24 @@ export class SearchComponent implements OnInit {

  errorMessage: string

-  constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
+  get maxScore() {
+    return this.results?.length > 0 ? this.results[0].score : 100
+  }
+
+  constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router, private documentService: DocumentService) { }

  ngOnInit(): void {
    this.route.queryParamMap.subscribe(paramMap => {
+      window.scrollTo(0, 0)
      this.query = paramMap.get('query')
+      this.more_like = paramMap.has('more_like') ? +paramMap.get('more_like') : null
+      if (this.more_like) {
+        this.documentService.get(this.more_like).subscribe(r => {
+          this.more_like_doc = r
+        })
+      } else {
+        this.more_like_doc = null
+      }
      this.searching = true
      this.currentPage = 1
      this.loadPage()
@@ -39,13 +59,14 @@ export class SearchComponent implements OnInit {
  }

  searchCorrectedQuery() {
-    this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
+    this.router.navigate(["search"], {queryParams: {query: this.correctedQuery, more_like: this.more_like}})
  }

  loadPage(append: boolean = false) {
    this.errorMessage = null
    this.correctedQuery = null
-    this.searchService.search(this.query, this.currentPage).subscribe(result => {
+
+    this.searchService.search(this.query, this.currentPage, this.more_like).subscribe(result => {
      if (append) {
        this.results.push(...result.results)
      } else {
--- a/src-ui/src/app/services/rest/search.service.ts
+++ b/src-ui/src/app/services/rest/search.service.ts
@@ -15,11 +15,17 @@ export class SearchService {
  
  constructor(private http: HttpClient, private documentService: DocumentService) { }

-  search(query: string, page?: number): Observable<SearchResult> {
-    let httpParams = new HttpParams().set('query', query)
+  search(query: string, page?: number, more_like?: number): Observable<SearchResult> {
+    let httpParams = new HttpParams()
+    if (query) {
+      httpParams = httpParams.set('query', query)
+    }
    if (page) {
      httpParams = httpParams.set('page', page.toString())
    }
+    if (more_like) {
+      httpParams = httpParams.set('more_like', more_like.toString())
+    }
    return this.http.get<SearchResult>(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe(
      map(result => {
        result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document))
--- a/src-ui/src/environments/environment.ts
+++ b/src-ui/src/environments/environment.ts
@@ -5,7 +5,8 @@
 export const environment = {
  production: false,
  apiBaseUrl: "http://localhost:8000/api/",
-  appTitle: "DEVELOPMENT P-NG"
+  appTitle: "Paperless-ng",
+  version: "DEVELOPMENT"
 };

 /*
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -247,7 +247,6 @@ class Consumer(LoggingMixin):

        with open(self.path, "rb") as f:
            document = Document.objects.create(
-                correspondent=file_info.correspondent,
                title=(self.override_title or file_info.title)[:127],
                content=text,
                mime_type=mime_type,
@@ -257,12 +256,6 @@ class Consumer(LoggingMixin):
                storage_type=storage_type
            )

-        relevant_tags = set(file_info.tags)
-        if relevant_tags:
-            tag_names = ", ".join([t.name for t in relevant_tags])
-            self.log("debug", "Tagging with {}".format(tag_names))
-            document.tags.add(*relevant_tags)
-
        self.apply_overrides(document)

        document.save()
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -3,7 +3,7 @@ import os
 from contextlib import contextmanager

 from django.conf import settings
-from whoosh import highlight
+from whoosh import highlight, classify, query
 from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
 from whoosh.highlight import Formatter, get_text
 from whoosh.index import create_in, exists_in, open_dir
@@ -20,32 +20,37 @@ class JsonFormatter(Formatter):
        self.seen = {}

    def format_token(self, text, token, replace=False):
-        seen = self.seen
        ttext = self._text(get_text(text, token, replace))
-        if ttext in seen:
-            termnum = seen[ttext]
-        else:
-            termnum = len(seen)
-            seen[ttext] = termnum
-
-        return {'text': ttext, 'term': termnum}
+        return {'text': ttext, 'highlight': 'true'}

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text
-
+        amend_token = None
        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
-                output.append({'text': text[index:t.startchar]})
-            output.append(self.format_token(text, t, replace))
+                text_inbetween = text[index:t.startchar]
+                if amend_token and t.startchar - index < 10:
+                    amend_token['text'] += text_inbetween
+                else:
+                    output.append({'text': text_inbetween,
+                                   'highlight': False})
+                    amend_token = None
+            token = self.format_token(text, t, replace)
+            if amend_token:
+                amend_token['text'] += token['text']
+            else:
+                output.append(token)
+                amend_token = token
            index = t.endchar
        if index < fragment.endchar:
-            output.append({'text': text[index:fragment.endchar]})
+            output.append({'text': text[index:fragment.endchar],
+                           'highlight': False})
        return output

    def format(self, fragments, replace=False):
@@ -120,22 +125,42 @@ def remove_document_from_index(document):


@contextmanager
-def query_page(ix, querystring, page):
+def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
+        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            qp.add_plugin(DateParserPlugin())
+            str_q = qp.parse(querystring)
+            corrected = searcher.correct_query(str_q, querystring)
+        else:
+            str_q = None
+            corrected = None
+
+        if more_like_doc_id:
+            docnum = searcher.document_number(id=more_like_doc_id)
+            kts = searcher.key_terms_from_text(
+                'content', more_like_doc_content, numterms=20,
+                model=classify.Bo1Model, normalize=False)
+            more_like_q = query.Or(
+                [query.Term('content', word, boost=weight)
+                 for word, weight in kts])
+            result_page = searcher.search_page(
+                more_like_q, page, filter=str_q, mask={docnum})
+        elif str_q:
+            result_page = searcher.search_page(str_q, page)
+        else:
+            raise ValueError(
+                "Either querystring or more_like_doc_id is required."
+            )

-        q = qp.parse(querystring)
-        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

-        corrected = searcher.correct_query(q, querystring)
-        if corrected.query != q:
+        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None
--- a/src/documents/migrations/1003_mime_types.py
+++ b/src/documents/migrations/1003_mime_types.py
@@ -11,6 +11,7 @@ from paperless.db import GnuPG
 STORAGE_TYPE_UNENCRYPTED = "unencrypted"
 STORAGE_TYPE_GPG = "gpg"

+
 def source_path(self):
    if self.filename:
        fname = str(self.filename)
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -357,54 +357,12 @@ class SavedViewFilterRule(models.Model):
 # TODO: why is this in the models file?
 class FileInfo:

-    # This epic regex *almost* worked for our needs, so I'm keeping it here for
-    # posterity, in the hopes that we might find a way to make it work one day.
-    ALMOST_REGEX = re.compile(
-        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
-        r"((?P<correspondent>{non_separated_word}+){separator})??"
-        r"(?P<title>{non_separated_word}+)"
-        r"({separator}(?P<tags>[a-z,0-9-]+))?"
-        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
-            separator=r"\s+-\s+",
-            non_separated_word=r"([\w,. ]|([^\s]-))"
-        )
-    )
    REGEXES = OrderedDict([
-        ("created-correspondent-title-tags", re.compile(
-            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
-            r"(?P<correspondent>.*) - "
-            r"(?P<title>.*) - "
-            r"(?P<tags>[a-z0-9\-,]*)$",
-            flags=re.IGNORECASE
-        )),
-        ("created-title-tags", re.compile(
-            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
-            r"(?P<title>.*) - "
-            r"(?P<tags>[a-z0-9\-,]*)$",
-            flags=re.IGNORECASE
-        )),
-        ("created-correspondent-title", re.compile(
-            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
-            r"(?P<correspondent>.*) - "
-            r"(?P<title>.*)$",
-            flags=re.IGNORECASE
-        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
-        ("correspondent-title-tags", re.compile(
-            r"(?P<correspondent>.*) - "
-            r"(?P<title>.*) - "
-            r"(?P<tags>[a-z0-9\-,]*)$",
-            flags=re.IGNORECASE
-        )),
-        ("correspondent-title", re.compile(
-            r"(?P<correspondent>.*) - "
-            r"(?P<title>.*)?$",
-            flags=re.IGNORECASE
-        )),
        ("title", re.compile(
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
@@ -427,23 +385,10 @@ class FileInfo:
        except ValueError:
            return None

-    @classmethod
-    def _get_correspondent(cls, name):
-        if not name:
-            return None
-        return Correspondent.objects.get_or_create(name=name)[0]
-
    @classmethod
    def _get_title(cls, title):
        return title

-    @classmethod
-    def _get_tags(cls, tags):
-        r = []
-        for t in tags.split(","):
-            r.append(Tag.objects.get_or_create(name=t)[0])
-        return tuple(r)
-
    @classmethod
    def _mangle_property(cls, properties, name):
        if name in properties:
@@ -453,15 +398,6 @@ class FileInfo:

    @classmethod
    def from_filename(cls, filename):
-        """
-        We use a crude naming convention to make handling the correspondent,
-        title, and tags easier:
-          "<date> - <correspondent> - <title> - <tags>"
-          "<correspondent> - <title> - <tags>"
-          "<correspondent> - <title>"
-          "<title>"
-        """
-
        # Mutate filename in-place before parsing its components
        # by applying at most one of the configured transformations.
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -492,7 +428,5 @@ class FileInfo:
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
-                cls._mangle_property(properties, "correspondent")
                cls._mangle_property(properties, "title")
-                cls._mangle_property(properties, "tags")
                return cls(**properties)
--- a/src/documents/templates/index.html
+++ b/src/documents/templates/index.html
@@ -5,7 +5,7 @@
 <html lang="en">
 <head>
  <meta charset="utf-8">
-  <title>PaperlessUi</title>
+  <title>Paperless-ng</title>
  <base href="/">
  <meta name="viewport" content="width=device-width, initial-scale=1">
 	<meta name="cookie_prefix" content="{{cookie_prefix}}">
--- a/src/documents/tests/test_admin.py
+++ b/src/documents/tests/test_admin.py
@@ -0,0 +1,57 @@
+from unittest import mock
+
+from django.contrib.admin.sites import AdminSite
+from django.test import TestCase
+from django.utils import timezone
+
+from documents.admin import DocumentAdmin
+from documents.models import Document, Tag
+
+
+class TestDocumentAdmin(TestCase):
+
+    def setUp(self) -> None:
+        self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
+
+    @mock.patch("documents.admin.index.add_or_update_document")
+    def test_save_model(self, m):
+        doc = Document.objects.create(title="test")
+        doc.title = "new title"
+        self.doc_admin.save_model(None, doc, None, None)
+        self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
+        m.assert_called_once()
+
+    def test_tags(self):
+        doc = Document.objects.create(title="test")
+        doc.tags.create(name="t1")
+        doc.tags.create(name="t2")
+
+        self.assertEqual(self.doc_admin.tags_(doc), "<span >t1, </span><span >t2, </span>")
+
+    def test_tags_empty(self):
+        doc = Document.objects.create(title="test")
+
+        self.assertEqual(self.doc_admin.tags_(doc), "")
+
+    @mock.patch("documents.admin.index.remove_document")
+    def test_delete_model(self, m):
+        doc = Document.objects.create(title="test")
+        self.doc_admin.delete_model(None, doc)
+        self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
+        m.assert_called_once()
+
+    @mock.patch("documents.admin.index.remove_document")
+    def test_delete_queryset(self, m):
+        for i in range(42):
+            Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
+
+        self.assertEqual(Document.objects.count(), 42)
+
+        self.doc_admin.delete_queryset(None, Document.objects.all())
+
+        self.assertEqual(m.call_count, 42)
+        self.assertEqual(Document.objects.count(), 0)
+
+    def test_created(self):
+        doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
+        self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -352,6 +352,25 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):

        self.assertEqual(correction, None)

+    def test_search_more_like(self):
+        d1 = Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
+        d2 = Document.objects.create(title="bank statement 1", content="things i paid for in august", checksum="B", pk=2)
+        d3 = Document.objects.create(title="bank statement 3", content="things i paid for in september", checksum="C", pk=3)
+        with AsyncWriter(index.open_index()) as writer:
+            index.update_document(writer, d1)
+            index.update_document(writer, d2)
+            index.update_document(writer, d3)
+
+        response = self.client.get(f"/api/search/?more_like={d2.id}")
+
+        self.assertEqual(response.status_code, 200)
+
+        results = response.data['results']
+        # d2 itself must be excluded; d3 should rank above d1.
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['id'], d3.id)
+        self.assertEqual(results[1]['id'], d1.id)
+
    def test_statistics(self):

        doc1 = Document.objects.create(title="none1", checksum="A")
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -29,81 +29,6 @@ class TestAttributes(TestCase):

        self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename)

-    def test_guess_attributes_from_name0(self):
-        self._test_guess_attributes_from_name(
-            "Sender - Title.pdf", "Sender", "Title", ())
-
-    def test_guess_attributes_from_name1(self):
-        self._test_guess_attributes_from_name(
-            "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())
-
-    def test_guess_attributes_from_name2(self):
-        self._test_guess_attributes_from_name(
-            "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())
-
-    def test_guess_attributes_from_name3(self):
-        self._test_guess_attributes_from_name(
-            "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())
-
-    def test_guess_attributes_from_name4(self):
-        self._test_guess_attributes_from_name(
-            "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())
-
-    def test_guess_attributes_from_name5(self):
-        self._test_guess_attributes_from_name(
-            "Sender - Title - tag1,tag2,tag3.pdf",
-            "Sender",
-            "Title",
-            self.TAGS
-        )
-
-    def test_guess_attributes_from_name6(self):
-        self._test_guess_attributes_from_name(
-            "Spaced Sender - Title - tag1,tag2,tag3.pdf",
-            "Spaced Sender",
-            "Title",
-            self.TAGS
-        )
-
-    def test_guess_attributes_from_name7(self):
-        self._test_guess_attributes_from_name(
-            "Sender - Spaced Title - tag1,tag2,tag3.pdf",
-            "Sender",
-            "Spaced Title",
-            self.TAGS
-        )
-
-    def test_guess_attributes_from_name8(self):
-        self._test_guess_attributes_from_name(
-            "Dashed-Sender - Title - tag1,tag2,tag3.pdf",
-            "Dashed-Sender",
-            "Title",
-            self.TAGS
-        )
-
-    def test_guess_attributes_from_name9(self):
-        self._test_guess_attributes_from_name(
-            "Sender - Dashed-Title - tag1,tag2,tag3.pdf",
-            "Sender",
-            "Dashed-Title",
-            self.TAGS
-        )
-
-    def test_guess_attributes_from_name10(self):
-        self._test_guess_attributes_from_name(
-            "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
-            "Σενδερ",
-            "Τιτλε",
-            self.TAGS
-        )
-
-    def test_guess_attributes_from_name_when_correspondent_empty(self):
-        self._test_guess_attributes_from_name(
-            ' - weird empty correspondent but should not break.pdf',
-            None,
-            'weird empty correspondent but should not break',
-            ()
-        )

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        self._test_guess_attributes_from_name(
@@ -121,28 +46,6 @@ class TestAttributes(TestCase):
            ()
        )

-    def test_guess_attributes_from_name_when_title_is_empty(self):
-        self._test_guess_attributes_from_name(
-            'weird correspondent but should not break - .pdf',
-            'weird correspondent but should not break',
-            '',
-            ()
-        )
-
-    def test_case_insensitive_tag_creation(self):
-        """
-        Tags should be detected and created as lower case.
-        :return:
-        """
-
-        filename = "Title - Correspondent - tAg1,TAG2.pdf"
-        self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
-
-        path = "Title - Correspondent - tag1,tag2.pdf"
-        self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
-
-        self.assertEqual(Tag.objects.all().count(), 2)
-

 class TestFieldPermutations(TestCase):

@@ -199,69 +102,7 @@ class TestFieldPermutations(TestCase):
            filename = template.format(**spec)
            self._test_guessed_attributes(filename, **spec)

-    def test_title_and_correspondent(self):
-        template = '{correspondent} - {title}.pdf'
-        for correspondent in self.valid_correspondents:
-            for title in self.valid_titles:
-                spec = dict(correspondent=correspondent, title=title)
-                filename = template.format(**spec)
-                self._test_guessed_attributes(filename, **spec)
-
-    def test_title_and_correspondent_and_tags(self):
-        template = '{correspondent} - {title} - {tags}.pdf'
-        for correspondent in self.valid_correspondents:
-            for title in self.valid_titles:
-                for tags in self.valid_tags:
-                    spec = dict(correspondent=correspondent, title=title,
-                                tags=tags)
-                    filename = template.format(**spec)
-                    self._test_guessed_attributes(filename, **spec)
-
-    def test_created_and_correspondent_and_title_and_tags(self):
-
-        template = (
-            "{created} - "
-            "{correspondent} - "
-            "{title} - "
-            "{tags}.pdf"
-        )
-
-        for created in self.valid_dates:
-            for correspondent in self.valid_correspondents:
-                for title in self.valid_titles:
-                    for tags in self.valid_tags:
-                        spec = {
-                            "created": created,
-                            "correspondent": correspondent,
-                            "title": title,
-                            "tags": tags,
-                        }
-                        self._test_guessed_attributes(
-                            template.format(**spec), **spec)
-
-    def test_created_and_correspondent_and_title(self):
-
-        template = "{created} - {correspondent} - {title}.pdf"
-
-        for created in self.valid_dates:
-            for correspondent in self.valid_correspondents:
-                for title in self.valid_titles:
-
-                    # Skip cases where title looks like a tag as we can't
-                    # accommodate such cases.
-                    if title.lower() == title:
-                        continue
-
-                    spec = {
-                        "created": created,
-                        "correspondent": correspondent,
-                        "title": title
-                    }
-                    self._test_guessed_attributes(
-                        template.format(**spec), **spec)
-
    def test_created_and_title(self):
-
        template = "{created} - {title}.pdf"

        for created in self.valid_dates:
@@ -273,21 +114,6 @@ class TestFieldPermutations(TestCase):
                self._test_guessed_attributes(
                    template.format(**spec), **spec)

-    def test_created_and_title_and_tags(self):
-
-        template = "{created} - {title} - {tags}.pdf"
-
-        for created in self.valid_dates:
-            for title in self.valid_titles:
-                for tags in self.valid_tags:
-                    spec = {
-                        "created": created,
-                        "title": title,
-                        "tags": tags
-                    }
-                    self._test_guessed_attributes(
-                        template.format(**spec), **spec)
-
    def test_invalid_date_format(self):
        info = FileInfo.from_filename("06112017Z - title.pdf")
        self.assertEqual(info.title, "title")
@@ -336,32 +162,6 @@ class TestFieldPermutations(TestCase):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "anotherall")

-        # Complex transformation without date in replacement string
-        with self.settings(
-                FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
-            info = FileInfo.from_filename(filename)
-            self.assertEqual(info.title, "0001")
-            self.assertEqual(len(info.tags), 2)
-            self.assertEqual(info.tags[0].name, "tag1")
-            self.assertEqual(info.tags[1].name, "tag2")
-            self.assertIsNone(info.created)
-
-        # Complex transformation with date in replacement string
-        with self.settings(
-            FILENAME_PARSE_TRANSFORMS=[
-                (none_patt, "none.gif"),
-                (exact_patt, repl2),    # <-- matches
-                (exact_patt, repl1),
-                (all_patt, "all.gif")]):
-            info = FileInfo.from_filename(filename)
-            self.assertEqual(info.title, "0001")
-            self.assertEqual(len(info.tags), 2)
-            self.assertEqual(info.tags[0].name, "tag1")
-            self.assertEqual(info.tags[1].name, "tag2")
-            self.assertEqual(info.created.year, 2019)
-            self.assertEqual(info.created.month, 9)
-            self.assertEqual(info.created.day, 8)
-

 class DummyParser(DocumentParser):

@@ -476,15 +276,13 @@ class TestConsumer(DirectoriesMixin, TestCase):

    def testOverrideFilename(self):
        filename = self.get_test_file()
-        override_filename = "My Bank - Statement for November.pdf"
+        override_filename = "Statement for November.pdf"

        document = self.consumer.try_consume_file(filename, override_filename=override_filename)

-        self.assertEqual(document.correspondent.name, "My Bank")
        self.assertEqual(document.title, "Statement for November")

    def testOverrideTitle(self):
-
        document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
        self.assertEqual(document.title, "Override Title")

@@ -594,11 +392,10 @@ class TestConsumer(DirectoriesMixin, TestCase):
    def testFilenameHandling(self):
        filename = self.get_test_file()

-        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
+        document = self.consumer.try_consume_file(filename, override_title="new docs")

        self.assertEqual(document.title, "new docs")
-        self.assertEqual(document.correspondent.name, "Bank")
-        self.assertEqual(document.filename, "Bank/new docs.pdf")
+        self.assertEqual(document.filename, "none/new docs.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.generate_unique_filename")
@@ -617,10 +414,9 @@ class TestConsumer(DirectoriesMixin, TestCase):

        Tag.objects.create(name="test", is_inbox_tag=True)

-        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
+        document = self.consumer.try_consume_file(filename, override_title="new docs")

        self.assertEqual(document.title, "new docs")
-        self.assertEqual(document.correspondent.name, "Bank")
        self.assertIsNotNone(os.path.isfile(document.title))
        self.assertTrue(os.path.isfile(document.source_path))

@@ -642,3 +438,31 @@ class TestConsumer(DirectoriesMixin, TestCase):
        self.assertEqual(document.document_type, dtype)
        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
+
+    @override_settings(CONSUMER_DELETE_DUPLICATES=True)
+    def test_delete_duplicate(self):
+        dst = self.get_test_file()
+        self.assertTrue(os.path.isfile(dst))
+        doc = self.consumer.try_consume_file(dst)
+
+        self.assertFalse(os.path.isfile(dst))
+        self.assertIsNotNone(doc)
+
+        dst = self.get_test_file()
+        self.assertTrue(os.path.isfile(dst))
+        self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
+        self.assertFalse(os.path.isfile(dst))
+
+    @override_settings(CONSUMER_DELETE_DUPLICATES=False)
+    def test_no_delete_duplicate(self):
+        dst = self.get_test_file()
+        self.assertTrue(os.path.isfile(dst))
+        doc = self.consumer.try_consume_file(dst)
+
+        self.assertFalse(os.path.isfile(dst))
+        self.assertIsNotNone(doc)
+
+        dst = self.get_test_file()
+        self.assertTrue(os.path.isfile(dst))
+        self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
+        self.assertTrue(os.path.isfile(dst))
--- a/src/documents/tests/test_file_handling.py
+++ b/src/documents/tests/test_file_handling.py
@@ -14,7 +14,7 @@ from django.utils import timezone
 from .utils import DirectoriesMixin
 from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \
    generate_unique_filename
-from ..models import Document, Correspondent, Tag
+from ..models import Document, Correspondent, Tag, DocumentType


 class TestFileHandling(DirectoriesMixin, TestCase):
@@ -190,6 +190,17 @@ class TestFileHandling(DirectoriesMixin, TestCase):
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
        self.assertTrue(os.path.isfile(important_file))

+    @override_settings(PAPERLESS_FILENAME_FORMAT="{document_type} - {title}")
+    def test_document_type(self):
+        dt = DocumentType.objects.create(name="my_doc_type")
+        d = Document.objects.create(title="the_doc", mime_type="application/pdf")
+
+        self.assertEqual(generate_filename(d), "none - the_doc.pdf")
+
+        d.document_type = dt
+
+        self.assertEqual(generate_filename(d), "my_doc_type - the_doc.pdf")
+
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
    def test_tags_with_underscore(self):
        document = Document()
--- a/src/documents/tests/test_management.py
+++ b/src/documents/tests/test_management.py
@@ -0,0 +1,135 @@
+import hashlib
+import tempfile
+import filecmp
+import os
+import shutil
+from pathlib import Path
+from unittest import mock
+
+from django.test import TestCase, override_settings
+
+
+from django.core.management import call_command
+
+from documents.file_handling import generate_filename
+from documents.management.commands.document_archiver import handle_document
+from documents.models import Document
+from documents.tests.utils import DirectoriesMixin
+
+
+sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
+
+
+class TestArchiver(DirectoriesMixin, TestCase):
+
+    def make_models(self):
+        return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf")
+
+    def test_archiver(self):
+
+        doc = self.make_models()
+        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
+
+        call_command('document_archiver')
+
+    def test_handle_document(self):
+
+        doc = self.make_models()
+        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
+
+        handle_document(doc.pk)
+
+        doc = Document.objects.get(id=doc.id)
+
+        self.assertIsNotNone(doc.checksum)
+        self.assertTrue(os.path.isfile(doc.archive_path))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
+
+
+class TestDecryptDocuments(TestCase):
+
+    @override_settings(
+        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
+        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
+        PASSPHRASE="test",
+        PAPERLESS_FILENAME_FORMAT=None
+    )
+    @mock.patch("documents.management.commands.decrypt_documents.input")
+    def test_decrypt(self, m):
+
+        media_dir = tempfile.mkdtemp()
+        originals_dir = os.path.join(media_dir, "documents", "originals")
+        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
+        os.makedirs(originals_dir, exist_ok=True)
+        os.makedirs(thumb_dir, exist_ok=True)
+
+        override_settings(
+            ORIGINALS_DIR=originals_dir,
+            THUMBNAIL_DIR=thumb_dir,
+            PASSPHRASE="test"
+        ).enable()
+
+        doc = Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg",  mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
+
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", f"0000002.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"))
+
+        call_command('decrypt_documents')
+
+        doc.refresh_from_db()
+
+        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
+        self.assertEqual(doc.filename, "0000002.pdf")
+        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png")))
+        self.assertTrue(os.path.isfile(doc.thumbnail_path))
+
+        with doc.source_file as f:
+            checksum = hashlib.md5(f.read()).hexdigest()
+            self.assertEqual(checksum, doc.checksum)
+
+
+class TestMakeIndex(TestCase):
+
+    @mock.patch("documents.management.commands.document_index.index_reindex")
+    def test_reindex(self, m):
+        call_command("document_index", "reindex")
+        m.assert_called_once()
+
+    @mock.patch("documents.management.commands.document_index.index_optimize")
+    def test_optimize(self, m):
+        call_command("document_index", "optimize")
+        m.assert_called_once()
+
+
+class TestRenamer(DirectoriesMixin, TestCase):
+
+    def test_rename(self):
+        doc = Document.objects.create(title="test", mime_type="application/pdf")
+        doc.filename = generate_filename(doc)
+        doc.save()
+
+        Path(doc.source_path).touch()
+
+        old_source_path = doc.source_path
+
+        with override_settings(PAPERLESS_FILENAME_FORMAT="{title}"):
+            call_command("document_renamer")
+
+        doc2 = Document.objects.get(id=doc.id)
+
+        self.assertEqual(doc2.filename, "test.pdf")
+        self.assertFalse(os.path.isfile(old_source_path))
+        self.assertFalse(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc2.source_path))
+
+
+class TestCreateClassifier(TestCase):
+
+    @mock.patch("documents.management.commands.document_create_classifier.train_classifier")
+    def test_create_classifier(self, m):
+        call_command("document_create_classifier")
+
+        m.assert_called_once()
--- a/src/documents/tests/test_management_archiver.py
+++ b/src/documents/tests/test_management_archiver.py
@@ -1,40 +0,0 @@
-import filecmp
-import os
-import shutil
-
-from django.core.management import call_command
-from django.test import TestCase
-
-from documents.management.commands.document_archiver import handle_document
-from documents.models import Document
-from documents.tests.utils import DirectoriesMixin
-
-
-sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
-
-
-class TestArchiver(DirectoriesMixin, TestCase):
-
-    def make_models(self):
-        return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf")
-
-    def test_archiver(self):
-
-        doc = self.make_models()
-        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
-
-        call_command('document_archiver')
-
-    def test_handle_document(self):
-
-        doc = self.make_models()
-        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
-
-        handle_document(doc.pk)
-
-        doc = Document.objects.get(id=doc.id)
-
-        self.assertIsNotNone(doc.checksum)
-        self.assertTrue(os.path.isfile(doc.archive_path))
-        self.assertTrue(os.path.isfile(doc.source_path))
-        self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
--- a/src/documents/tests/test_management_decrypt.py
+++ b/src/documents/tests/test_management_decrypt.py
@@ -1,57 +0,0 @@
-import hashlib
-import json
-import os
-import shutil
-import tempfile
-from unittest import mock
-
-from django.core.management import call_command
-from django.test import TestCase, override_settings
-
-from documents.management.commands import document_exporter
-from documents.models import Document, Tag, DocumentType, Correspondent
-
-
-class TestDecryptDocuments(TestCase):
-
-    @override_settings(
-        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
-        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
-        PASSPHRASE="test",
-        PAPERLESS_FILENAME_FORMAT=None
-    )
-    @mock.patch("documents.management.commands.decrypt_documents.input")
-    def test_decrypt(self, m):
-
-        media_dir = tempfile.mkdtemp()
-        originals_dir = os.path.join(media_dir, "documents", "originals")
-        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
-        os.makedirs(originals_dir, exist_ok=True)
-        os.makedirs(thumb_dir, exist_ok=True)
-
-        override_settings(
-            ORIGINALS_DIR=originals_dir,
-            THUMBNAIL_DIR=thumb_dir,
-            PASSPHRASE="test"
-        ).enable()
-
-        doc = Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg",  mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
-
-        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
-        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", f"0000002.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"))
-
-        call_command('decrypt_documents')
-
-        doc.refresh_from_db()
-
-        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
-        self.assertEqual(doc.filename, "0000002.pdf")
-        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
-        self.assertTrue(os.path.isfile(doc.source_path))
-        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png")))
-        self.assertTrue(os.path.isfile(doc.thumbnail_path))
-
-        with doc.source_file as f:
-            checksum = hashlib.md5(f.read()).hexdigest()
-            self.assertEqual(checksum, doc.checksum)
-
--- a/src/documents/tests/test_migrations.py
+++ b/src/documents/tests/test_migrations.py
@@ -0,0 +1,129 @@
+import os
+import shutil
+from pathlib import Path
+
+from django.apps import apps
+from django.conf import settings
+from django.db import connection
+from django.db.migrations.executor import MigrationExecutor
+from django.test import TestCase, TransactionTestCase, override_settings
+
+from documents.models import Document
+from documents.parsers import get_default_file_extension
+from documents.tests.utils import DirectoriesMixin
+
+
+class TestMigrations(TransactionTestCase):
+
+    @property
+    def app(self):
+        return apps.get_containing_app_config(type(self).__module__).name
+
+    migrate_from = None
+    migrate_to = None
+
+    def setUp(self):
+        super(TestMigrations, self).setUp()
+
+        assert self.migrate_from and self.migrate_to, \
+            "TestCase '{}' must define migrate_from and migrate_to     properties".format(type(self).__name__)
+        self.migrate_from = [(self.app, self.migrate_from)]
+        self.migrate_to = [(self.app, self.migrate_to)]
+        executor = MigrationExecutor(connection)
+        old_apps = executor.loader.project_state(self.migrate_from).apps
+
+        # Reverse to the original migration
+        executor.migrate(self.migrate_from)
+
+        self.setUpBeforeMigration(old_apps)
+
+        # Run the migration to test
+        executor = MigrationExecutor(connection)
+        executor.loader.build_graph()  # reload.
+        executor.migrate(self.migrate_to)
+
+        self.apps = executor.loader.project_state(self.migrate_to).apps
+
+    def setUpBeforeMigration(self, apps):
+        pass
+
+
+STORAGE_TYPE_UNENCRYPTED = "unencrypted"
+STORAGE_TYPE_GPG = "gpg"
+
+
+def source_path_before(self):
+    if self.filename:
+        fname = str(self.filename)
+    else:
+        fname = "{:07}.{}".format(self.pk, self.file_type)
+        if self.storage_type == STORAGE_TYPE_GPG:
+            fname += ".gpg"
+
+    return os.path.join(
+        settings.ORIGINALS_DIR,
+        fname
+    )
+
+
+def file_type_after(self):
+    return get_default_file_extension(self.mime_type)
+
+
+def source_path_after(doc):
+    if doc.filename:
+        fname = str(doc.filename)
+    else:
+        fname = "{:07}{}".format(doc.pk, file_type_after(doc))
+        if doc.storage_type == STORAGE_TYPE_GPG:
+            fname += ".gpg"  # pragma: no cover
+
+    return os.path.join(
+        settings.ORIGINALS_DIR,
+        fname
+    )
+
+
+@override_settings(PASSPHRASE="test")
+class TestMigrateMimeType(DirectoriesMixin, TestMigrations):
+
+    migrate_from = '1002_auto_20201111_1105'
+    migrate_to = '1003_mime_types'
+
+    def setUpBeforeMigration(self, apps):
+        Document = apps.get_model("documents", "Document")
+        doc = Document.objects.create(title="test", file_type="pdf", filename="file1.pdf")
+        self.doc_id = doc.id
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_before(doc))
+
+        doc2 = Document.objects.create(checksum="B", file_type="pdf", storage_type=STORAGE_TYPE_GPG)
+        self.doc2_id = doc2.id
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), source_path_before(doc2))
+
+    def testMimeTypesMigrated(self):
+        Document = self.apps.get_model('documents', 'Document')
+
+        doc = Document.objects.get(id=self.doc_id)
+        self.assertEqual(doc.mime_type, "application/pdf")
+
+        doc2 = Document.objects.get(id=self.doc2_id)
+        self.assertEqual(doc2.mime_type, "application/pdf")
+
+
+@override_settings(PASSPHRASE="test")
+class TestMigrateMimeTypeBackwards(DirectoriesMixin, TestMigrations):
+
+    migrate_from = '1003_mime_types'
+    migrate_to = '1002_auto_20201111_1105'
+
+    def setUpBeforeMigration(self, apps):
+        Document = apps.get_model("documents", "Document")
+        doc = Document.objects.create(title="test", mime_type="application/pdf", filename="file1.pdf")
+        self.doc_id = doc.id
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_after(doc))
+
+    def testMimeTypesReverted(self):
+        Document = self.apps.get_model('documents', 'Document')
+
+        doc = Document.objects.get(id=self.doc_id)
+        self.assertEqual(doc.file_type, "pdf")
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -389,14 +389,27 @@ class SearchView(APIView):
                }

    def get(self, request, format=None):
-        if 'query' not in request.query_params:
+
+        if 'query' in request.query_params:
+            query = request.query_params['query']
+        else:
+            query = None
+
+        if 'more_like' in request.query_params:
+            more_like_id = request.query_params['more_like']
+            more_like_content = Document.objects.get(id=more_like_id).content
+        else:
+            more_like_id = None
+            more_like_content = None
+
+        if not query and not more_like_id:
            return Response({
                'count': 0,
                'page': 0,
                'page_count': 0,
+                'corrected_query': None,
                'results': []})

-        query = request.query_params['query']
        try:
            page = int(request.query_params.get('page', 1))
        except (ValueError, TypeError):
@@ -406,8 +419,7 @@ class SearchView(APIView):
            page = 1

        try:
-            with index.query_page(self.ix, query, page) as (result_page,
-                                                            corrected_query):
+            with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501
                return Response(
                    {'count': len(result_page),
                     'page': result_page.pagenum,
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -13,18 +13,17 @@ writeable_hint = (
 )


-def path_check(env_var):
+def path_check(var, directory):
    messages = []
-    directory = os.getenv(env_var)
    if directory:
        if not os.path.exists(directory):
            messages.append(Error(
-                exists_message.format(env_var),
+                exists_message.format(var),
                exists_hint.format(directory)
            ))
        elif not os.access(directory, os.W_OK | os.X_OK):
            messages.append(Error(
-                writeable_message.format(env_var),
+                writeable_message.format(var),
                writeable_hint.format(directory)
            ))
    return messages
@@ -36,12 +35,9 @@ def paths_check(app_configs, **kwargs):
    Check the various paths for existence, readability and writeability
    """

-    check_messages = path_check("PAPERLESS_DATA_DIR") + \
-        path_check("PAPERLESS_MEDIA_ROOT") + \
-        path_check("PAPERLESS_CONSUMPTION_DIR") + \
-        path_check("PAPERLESS_STATICDIR")
-
-    return check_messages
+    return path_check("PAPERLESS_DATA_DIR", settings.DATA_DIR) + \
+        path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT) + \
+        path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)


@register()
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -160,13 +160,6 @@ if AUTO_LOGIN_USERNAME:
    MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware')


-if DEBUG:
-    X_FRAME_OPTIONS = ''
-    # this should really be 'allow-from uri' but its not supported in any mayor
-    # browser.
-else:
-    X_FRAME_OPTIONS = 'SAMEORIGIN'
-
 # We allow CORS from localhost:8080
 CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))

--- a/src/paperless/tests/test_checks.py
+++ b/src/paperless/tests/test_checks.py
@@ -0,0 +1,54 @@
+import os
+import shutil
+
+from django.test import TestCase, override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from paperless import binaries_check, paths_check
+from paperless.checks import debug_mode_check
+
+
+class TestChecks(DirectoriesMixin, TestCase):
+
+    def test_binaries(self):
+        self.assertEqual(binaries_check(None), [])
+
+    @override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot")
+    def test_binaries_fail(self):
+        self.assertEqual(len(binaries_check(None)), 2)
+
+    def test_paths_check(self):
+        self.assertEqual(paths_check(None), [])
+
+    @override_settings(MEDIA_ROOT="uuh",
+                       DATA_DIR="whatever",
+                       CONSUMPTION_DIR="idontcare")
+    def test_paths_check_dont_exist(self):
+        msgs = paths_check(None)
+        self.assertEqual(len(msgs), 3, str(msgs))
+
+        for msg in msgs:
+            self.assertTrue(msg.msg.endswith("is set but doesn't exist."))
+
+    def test_paths_check_no_access(self):
+        os.chmod(self.dirs.data_dir, 0o000)
+        os.chmod(self.dirs.media_dir, 0o000)
+        os.chmod(self.dirs.consumption_dir, 0o000)
+
+        self.addCleanup(os.chmod, self.dirs.data_dir, 0o777)
+        self.addCleanup(os.chmod, self.dirs.media_dir, 0o777)
+        self.addCleanup(os.chmod, self.dirs.consumption_dir, 0o777)
+
+        msgs = paths_check(None)
+        self.assertEqual(len(msgs), 3)
+
+        for msg in msgs:
+            self.assertTrue(msg.msg.endswith("is not writeable"))
+
+    @override_settings(DEBUG=False)
+    def test_debug_disabled(self):
+        self.assertEqual(debug_mode_check(None), [])
+
+    @override_settings(DEBUG=True)
+    def test_debug_enabled(self):
+        self.assertEqual(len(debug_mode_check(None)), 1)
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@@ -1,7 +1,7 @@
 import subprocess

 from django.conf import settings
-from django.core.checks import Error, register
+from django.core.checks import Error, Warning, register


 def get_tesseract_langs():
--- a/src/paperless_tesseract/languages.py
+++ b/src/paperless_tesseract/languages.py
@@ -1,194 +0,0 @@
-# Thanks to the Library of Congress and some creative use of sed and awk:
-# http://www.loc.gov/standards/iso639-2/php/English_list.php
-
-ISO639 = {
-
-    "aa": "aar",
-    "ab": "abk",
-    "ae": "ave",
-    "af": "afr",
-    "ak": "aka",
-    "am": "amh",
-    "an": "arg",
-    "ar": "ara",
-    "as": "asm",
-    "av": "ava",
-    "ay": "aym",
-    "az": "aze",
-    "ba": "bak",
-    "be": "bel",
-    "bg": "bul",
-    "bh": "bih",
-    "bi": "bis",
-    "bm": "bam",
-    "bn": "ben",
-    "bo": "bod",
-    "br": "bre",
-    "bs": "bos",
-    "ca": "cat",
-    "ce": "che",
-    "ch": "cha",
-    "co": "cos",
-    "cr": "cre",
-    "cs": "ces",
-    "cu": "chu",
-    "cv": "chv",
-    "cy": "cym",
-    "da": "dan",
-    "de": "deu",
-    "dv": "div",
-    "dz": "dzo",
-    "ee": "ewe",
-    "el": "ell",
-    "en": "eng",
-    "eo": "epo",
-    "es": "spa",
-    "et": "est",
-    "eu": "eus",
-    "fa": "fas",
-    "ff": "ful",
-    "fi": "fin",
-    "fj": "fij",
-    "fo": "fao",
-    "fr": "fra",
-    "fy": "fry",
-    "ga": "gle",
-    "gd": "gla",
-    "gl": "glg",
-    "gn": "grn",
-    "gu": "guj",
-    "gv": "glv",
-    "ha": "hau",
-    "he": "heb",
-    "hi": "hin",
-    "ho": "hmo",
-    "hr": "hrv",
-    "ht": "hat",
-    "hu": "hun",
-    "hy": "hye",
-    "hz": "her",
-    "ia": "ina",
-    "id": "ind",
-    "ie": "ile",
-    "ig": "ibo",
-    "ii": "iii",
-    "ik": "ipk",
-    "io": "ido",
-    "is": "isl",
-    "it": "ita",
-    "iu": "iku",
-    "ja": "jpn",
-    "jv": "jav",
-    "ka": "kat",
-    "kg": "kon",
-    "ki": "kik",
-    "kj": "kua",
-    "kk": "kaz",
-    "kl": "kal",
-    "km": "khm",
-    "kn": "kan",
-    "ko": "kor",
-    "kr": "kau",
-    "ks": "kas",
-    "ku": "kur",
-    "kv": "kom",
-    "kw": "cor",
-    "ky": "kir",
-    "la": "lat",
-    "lb": "ltz",
-    "lg": "lug",
-    "li": "lim",
-    "ln": "lin",
-    "lo": "lao",
-    "lt": "lit",
-    "lu": "lub",
-    "lv": "lav",
-    "mg": "mlg",
-    "mh": "mah",
-    "mi": "mri",
-    "mk": "mkd",
-    "ml": "mal",
-    "mn": "mon",
-    "mr": "mar",
-    "ms": "msa",
-    "mt": "mlt",
-    "my": "mya",
-    "na": "nau",
-    "nb": "nob",
-    "nd": "nde",
-    "ne": "nep",
-    "ng": "ndo",
-    "nl": "nld",
-    "no": "nor",
-    "nr": "nbl",
-    "nv": "nav",
-    "ny": "nya",
-    "oc": "oci",
-    "oj": "oji",
-    "om": "orm",
-    "or": "ori",
-    "os": "oss",
-    "pa": "pan",
-    "pi": "pli",
-    "pl": "pol",
-    "ps": "pus",
-    "pt": "por",
-    "qu": "que",
-    "rm": "roh",
-    "rn": "run",
-    "ro": "ron",
-    "ru": "rus",
-    "rw": "kin",
-    "sa": "san",
-    "sc": "srd",
-    "sd": "snd",
-    "se": "sme",
-    "sg": "sag",
-    "si": "sin",
-    "sk": "slk",
-    "sl": "slv",
-    "sm": "smo",
-    "sn": "sna",
-    "so": "som",
-    "sq": "sqi",
-    "sr": "srp",
-    "ss": "ssw",
-    "st": "sot",
-    "su": "sun",
-    "sv": "swe",
-    "sw": "swa",
-    "ta": "tam",
-    "te": "tel",
-    "tg": "tgk",
-    "th": "tha",
-    "ti": "tir",
-    "tk": "tuk",
-    "tl": "tgl",
-    "tn": "tsn",
-    "to": "ton",
-    "tr": "tur",
-    "ts": "tso",
-    "tt": "tat",
-    "tw": "twi",
-    "ty": "tah",
-    "ug": "uig",
-    "uk": "ukr",
-    "ur": "urd",
-    "uz": "uzb",
-    "ve": "ven",
-    "vi": "vie",
-    "vo": "vol",
-    "wa": "wln",
-    "wo": "wol",
-    "xh": "xho",
-    "yi": "yid",
-    "yo": "yor",
-    "za": "zha",
-
-    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra".  I
-    # have no idea which one is better, so I just picked the bigger file.
-    "zh": "chi_tra",
-
-    "zu": "zul"
-
-}
--- a/src/paperless_tesseract/tests/test_checks.py
+++ b/src/paperless_tesseract/tests/test_checks.py
@@ -0,0 +1,26 @@
+from unittest import mock
+
+from django.core.checks import ERROR
+from django.test import TestCase, override_settings
+
+from paperless_tesseract import check_default_language_available
+
+
+class TestChecks(TestCase):
+
+    def test_default_language(self):
+        msgs = check_default_language_available(None)
+
+    @override_settings(OCR_LANGUAGE="")
+    def test_no_language(self):
+        msgs = check_default_language_available(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(msgs[0].msg.startswith("No OCR language has been specified with PAPERLESS_OCR_LANGUAGE"))
+
+    @override_settings(OCR_LANGUAGE="ita")
+    @mock.patch("paperless_tesseract.checks.get_tesseract_langs")
+    def test_invalid_language(self, m):
+        m.return_value = ["deu", "eng"]
+        msgs = check_default_language_available(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertEqual(msgs[0].level, ERROR)
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -35,15 +35,3 @@ class TextDocumentParser(DocumentParser):
    def parse(self, document_path, mime_type):
        with open(document_path, 'r') as f:
            self.text = f.read()
-
-
-def run_command(*args):
-    environment = os.environ.copy()
-    if settings.CONVERT_MEMORY_LIMIT:
-        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
-    if settings.CONVERT_TMPDIR:
-        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
-
-    if not subprocess.Popen(' '.join(args), env=environment,
-                            shell=True).wait() == 0:
-        raise ParseError("Convert failed at {}".format(args))