![]()
diff --git a/src-ui/src/app/components/document-list/document-list.component.html b/src-ui/src/app/components/document-list/document-list.component.html
index 699f13840..5ce4efd2a 100644
--- a/src-ui/src/app/components/document-list/document-list.component.html
+++ b/src-ui/src/app/components/document-list/document-list.component.html
@@ -151,5 +151,5 @@
diff --git a/src-ui/src/app/components/search/result-highlight/result-highlight.component.html b/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
index 1842f5cea..5dc5baa94 100644
--- a/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
+++ b/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
@@ -1,3 +1,3 @@
...
- {{token.text}} ...
+ {{token.text}} ...
\ No newline at end of file
diff --git a/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss b/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
index 645fb0426..e04dd13b2 100644
--- a/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
+++ b/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
@@ -1,4 +1,4 @@
.match {
color: black;
- background-color: orange;
+ background-color: rgb(255, 211, 66);
}
\ No newline at end of file
diff --git a/src-ui/src/app/components/search/search.component.html b/src-ui/src/app/components/search/search.component.html
index 55fcee900..de6f0133f 100644
--- a/src-ui/src/app/components/search/search.component.html
+++ b/src-ui/src/app/components/search/search.component.html
@@ -3,7 +3,12 @@
Invalid search query: {{errorMessage}}
-
+
+ Showing documents similar to
+ {{more_like_doc?.original_file_name}}
+
+
+
Search string: {{query}}
- Did you mean "{{correctedQuery}}"?
@@ -15,7 +20,9 @@
{{resultCount}} result(s)
+ [details]="result.highlights"
+ [searchScore]="result.score / maxScore"
+ [moreLikeThis]="true">
diff --git a/src-ui/src/app/components/search/search.component.ts b/src-ui/src/app/components/search/search.component.ts
index de8b4652f..4570ac3fa 100644
--- a/src-ui/src/app/components/search/search.component.ts
+++ b/src-ui/src/app/components/search/search.component.ts
@@ -1,6 +1,9 @@
import { Component, OnInit } from '@angular/core';
import { ActivatedRoute, Router } from '@angular/router';
+import { PaperlessDocument } from 'src/app/data/paperless-document';
+import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
import { SearchHit } from 'src/app/data/search-result';
+import { DocumentService } from 'src/app/services/rest/document.service';
import { SearchService } from 'src/app/services/rest/search.service';
@Component({
@@ -14,6 +17,10 @@ export class SearchComponent implements OnInit {
query: string = ""
+ more_like: number
+
+ more_like_doc: PaperlessDocument
+
searching = false
currentPage = 1
@@ -26,11 +33,24 @@ export class SearchComponent implements OnInit {
errorMessage: string
- constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
+ get maxScore() {
+ return this.results?.length > 0 ? this.results[0].score : 100
+ }
+
+ constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router, private documentService: DocumentService) { }
ngOnInit(): void {
this.route.queryParamMap.subscribe(paramMap => {
+ window.scrollTo(0, 0)
this.query = paramMap.get('query')
+ this.more_like = paramMap.has('more_like') ? +paramMap.get('more_like') : null
+ if (this.more_like) {
+ this.documentService.get(this.more_like).subscribe(r => {
+ this.more_like_doc = r
+ })
+ } else {
+ this.more_like_doc = null
+ }
this.searching = true
this.currentPage = 1
this.loadPage()
@@ -39,13 +59,14 @@ export class SearchComponent implements OnInit {
}
searchCorrectedQuery() {
- this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
+ this.router.navigate(["search"], {queryParams: {query: this.correctedQuery, more_like: this.more_like}})
}
loadPage(append: boolean = false) {
this.errorMessage = null
this.correctedQuery = null
- this.searchService.search(this.query, this.currentPage).subscribe(result => {
+
+ this.searchService.search(this.query, this.currentPage, this.more_like).subscribe(result => {
if (append) {
this.results.push(...result.results)
} else {
diff --git a/src-ui/src/app/services/rest/search.service.ts b/src-ui/src/app/services/rest/search.service.ts
index b19a55769..3799f3dc7 100644
--- a/src-ui/src/app/services/rest/search.service.ts
+++ b/src-ui/src/app/services/rest/search.service.ts
@@ -15,11 +15,17 @@ export class SearchService {
constructor(private http: HttpClient, private documentService: DocumentService) { }
- search(query: string, page?: number): Observable
{
- let httpParams = new HttpParams().set('query', query)
+ search(query: string, page?: number, more_like?: number): Observable {
+ let httpParams = new HttpParams()
+ if (query) {
+ httpParams = httpParams.set('query', query)
+ }
if (page) {
httpParams = httpParams.set('page', page.toString())
}
+ if (more_like) {
+ httpParams = httpParams.set('more_like', more_like.toString())
+ }
return this.http.get(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe(
map(result => {
result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document))
diff --git a/src-ui/src/environments/environment.ts b/src-ui/src/environments/environment.ts
index 5e4b148dc..29a8f3af6 100644
--- a/src-ui/src/environments/environment.ts
+++ b/src-ui/src/environments/environment.ts
@@ -5,7 +5,8 @@
export const environment = {
production: false,
apiBaseUrl: "http://localhost:8000/api/",
- appTitle: "DEVELOPMENT P-NG"
+ appTitle: "Paperless-ng",
+ version: "DEVELOPMENT"
};
/*
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index e4da51f1d..ab4912a36 100755
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -247,7 +247,6 @@ class Consumer(LoggingMixin):
with open(self.path, "rb") as f:
document = Document.objects.create(
- correspondent=file_info.correspondent,
title=(self.override_title or file_info.title)[:127],
content=text,
mime_type=mime_type,
@@ -257,12 +256,6 @@ class Consumer(LoggingMixin):
storage_type=storage_type
)
- relevant_tags = set(file_info.tags)
- if relevant_tags:
- tag_names = ", ".join([t.name for t in relevant_tags])
- self.log("debug", "Tagging with {}".format(tag_names))
- document.tags.add(*relevant_tags)
-
self.apply_overrides(document)
document.save()
diff --git a/src/documents/index.py b/src/documents/index.py
index 53bf34542..308ee932e 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -3,7 +3,7 @@ import os
from contextlib import contextmanager
from django.conf import settings
-from whoosh import highlight
+from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
@@ -20,32 +20,37 @@ class JsonFormatter(Formatter):
self.seen = {}
def format_token(self, text, token, replace=False):
- seen = self.seen
ttext = self._text(get_text(text, token, replace))
- if ttext in seen:
- termnum = seen[ttext]
- else:
- termnum = len(seen)
- seen[ttext] = termnum
-
- return {'text': ttext, 'term': termnum}
+ return {'text': ttext, 'highlight': 'true'}
def format_fragment(self, fragment, replace=False):
output = []
index = fragment.startchar
text = fragment.text
-
+ amend_token = None
for t in fragment.matches:
if t.startchar is None:
continue
if t.startchar < index:
continue
if t.startchar > index:
- output.append({'text': text[index:t.startchar]})
- output.append(self.format_token(text, t, replace))
+ text_inbetween = text[index:t.startchar]
+ if amend_token and t.startchar - index < 10:
+ amend_token['text'] += text_inbetween
+ else:
+ output.append({'text': text_inbetween,
+ 'highlight': False})
+ amend_token = None
+ token = self.format_token(text, t, replace)
+ if amend_token:
+ amend_token['text'] += token['text']
+ else:
+ output.append(token)
+ amend_token = token
index = t.endchar
if index < fragment.endchar:
- output.append({'text': text[index:fragment.endchar]})
+ output.append({'text': text[index:fragment.endchar],
+ 'highlight': False})
return output
def format(self, fragments, replace=False):
@@ -120,22 +125,42 @@ def remove_document_from_index(document):
@contextmanager
-def query_page(ix, querystring, page):
+def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
searcher = ix.searcher()
try:
- qp = MultifieldParser(
- ["content", "title", "correspondent", "tag", "type"],
- ix.schema)
- qp.add_plugin(DateParserPlugin())
+ if querystring:
+ qp = MultifieldParser(
+ ["content", "title", "correspondent", "tag", "type"],
+ ix.schema)
+ qp.add_plugin(DateParserPlugin())
+ str_q = qp.parse(querystring)
+ corrected = searcher.correct_query(str_q, querystring)
+ else:
+ str_q = None
+ corrected = None
+
+ if more_like_doc_id:
+ docnum = searcher.document_number(id=more_like_doc_id)
+ kts = searcher.key_terms_from_text(
+ 'content', more_like_doc_content, numterms=20,
+ model=classify.Bo1Model, normalize=False)
+ more_like_q = query.Or(
+ [query.Term('content', word, boost=weight)
+ for word, weight in kts])
+ result_page = searcher.search_page(
+ more_like_q, page, filter=str_q, mask={docnum})
+ elif str_q:
+ result_page = searcher.search_page(str_q, page)
+ else:
+ raise ValueError(
+ "Either querystring or more_like_doc_id is required."
+ )
- q = qp.parse(querystring)
- result_page = searcher.search_page(q, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
- corrected = searcher.correct_query(q, querystring)
- if corrected.query != q:
+ if corrected and corrected.query != str_q:
corrected_query = corrected.string
else:
corrected_query = None
diff --git a/src/documents/migrations/1003_mime_types.py b/src/documents/migrations/1003_mime_types.py
index 78ecced2b..c196f29f4 100644
--- a/src/documents/migrations/1003_mime_types.py
+++ b/src/documents/migrations/1003_mime_types.py
@@ -11,6 +11,7 @@ from paperless.db import GnuPG
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
+
def source_path(self):
if self.filename:
fname = str(self.filename)
diff --git a/src/documents/models.py b/src/documents/models.py
index 3a6d155ed..168dd8c7b 100755
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -357,54 +357,12 @@ class SavedViewFilterRule(models.Model):
# TODO: why is this in the models file?
class FileInfo:
- # This epic regex *almost* worked for our needs, so I'm keeping it here for
- # posterity, in the hopes that we might find a way to make it work one day.
- ALMOST_REGEX = re.compile(
- r"^((?P\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
- r"((?P{non_separated_word}+){separator})??"
- r"(?P{non_separated_word}+)"
- r"({separator}(?P[a-z,0-9-]+))?"
- r"\.(?P[a-zA-Z.-]+)$".format(
- separator=r"\s+-\s+",
- non_separated_word=r"([\w,. ]|([^\s]-))"
- )
- )
REGEXES = OrderedDict([
- ("created-correspondent-title-tags", re.compile(
- r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
- r"(?P.*) - "
- r"(?P.*) - "
- r"(?P[a-z0-9\-,]*)$",
- flags=re.IGNORECASE
- )),
- ("created-title-tags", re.compile(
- r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
- r"(?P.*) - "
- r"(?P[a-z0-9\-,]*)$",
- flags=re.IGNORECASE
- )),
- ("created-correspondent-title", re.compile(
- r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
- r"(?P.*) - "
- r"(?P.*)$",
- flags=re.IGNORECASE
- )),
("created-title", re.compile(
r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P.*)$",
flags=re.IGNORECASE
)),
- ("correspondent-title-tags", re.compile(
- r"(?P.*) - "
- r"(?P.*) - "
- r"(?P[a-z0-9\-,]*)$",
- flags=re.IGNORECASE
- )),
- ("correspondent-title", re.compile(
- r"(?P.*) - "
- r"(?P.*)?$",
- flags=re.IGNORECASE
- )),
("title", re.compile(
r"(?P.*)$",
flags=re.IGNORECASE
@@ -427,23 +385,10 @@ class FileInfo:
except ValueError:
return None
- @classmethod
- def _get_correspondent(cls, name):
- if not name:
- return None
- return Correspondent.objects.get_or_create(name=name)[0]
-
@classmethod
def _get_title(cls, title):
return title
- @classmethod
- def _get_tags(cls, tags):
- r = []
- for t in tags.split(","):
- r.append(Tag.objects.get_or_create(name=t)[0])
- return tuple(r)
-
@classmethod
def _mangle_property(cls, properties, name):
if name in properties:
@@ -453,15 +398,6 @@ class FileInfo:
@classmethod
def from_filename(cls, filename):
- """
- We use a crude naming convention to make handling the correspondent,
- title, and tags easier:
- " - - - "
- " - - "
- " - "
- ""
- """
-
# Mutate filename in-place before parsing its components
# by applying at most one of the configured transformations.
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -492,7 +428,5 @@ class FileInfo:
if m:
properties = m.groupdict()
cls._mangle_property(properties, "created")
- cls._mangle_property(properties, "correspondent")
cls._mangle_property(properties, "title")
- cls._mangle_property(properties, "tags")
return cls(**properties)
diff --git a/src/documents/templates/index.html b/src/documents/templates/index.html
index 06dbb678e..d086be0fe 100644
--- a/src/documents/templates/index.html
+++ b/src/documents/templates/index.html
@@ -5,7 +5,7 @@
- PaperlessUi
+ Paperless-ng
diff --git a/src/documents/tests/test_admin.py b/src/documents/tests/test_admin.py
new file mode 100644
index 000000000..b280c43ea
--- /dev/null
+++ b/src/documents/tests/test_admin.py
@@ -0,0 +1,57 @@
+from unittest import mock
+
+from django.contrib.admin.sites import AdminSite
+from django.test import TestCase
+from django.utils import timezone
+
+from documents.admin import DocumentAdmin
+from documents.models import Document, Tag
+
+
+class TestDocumentAdmin(TestCase):
+
+ def setUp(self) -> None:
+ self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
+
+ @mock.patch("documents.admin.index.add_or_update_document")
+ def test_save_model(self, m):
+ doc = Document.objects.create(title="test")
+ doc.title = "new title"
+ self.doc_admin.save_model(None, doc, None, None)
+ self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
+ m.assert_called_once()
+
+ def test_tags(self):
+ doc = Document.objects.create(title="test")
+ doc.tags.create(name="t1")
+ doc.tags.create(name="t2")
+
+ self.assertEqual(self.doc_admin.tags_(doc), "t1, t2, ")
+
+ def test_tags_empty(self):
+ doc = Document.objects.create(title="test")
+
+ self.assertEqual(self.doc_admin.tags_(doc), "")
+
+ @mock.patch("documents.admin.index.remove_document")
+ def test_delete_model(self, m):
+ doc = Document.objects.create(title="test")
+ self.doc_admin.delete_model(None, doc)
+ self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
+ m.assert_called_once()
+
+ @mock.patch("documents.admin.index.remove_document")
+ def test_delete_queryset(self, m):
+ for i in range(42):
+ Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
+
+ self.assertEqual(Document.objects.count(), 42)
+
+ self.doc_admin.delete_queryset(None, Document.objects.all())
+
+ self.assertEqual(m.call_count, 42)
+ self.assertEqual(Document.objects.count(), 0)
+
+ def test_created(self):
+ doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
+ self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")
diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py
index 43da9c058..f4f787565 100644
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -352,6 +352,25 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(correction, None)
+ def test_search_more_like(self):
+ d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
+ d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B")
+ d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C")
+ with AsyncWriter(index.open_index()) as writer:
+ index.update_document(writer, d1)
+ index.update_document(writer, d2)
+ index.update_document(writer, d3)
+
+ response = self.client.get(f"/api/search/?more_like={d2.id}")
+
+ self.assertEqual(response.status_code, 200)
+
+ results = response.data['results']
+
+ self.assertEqual(len(results), 2)
+ self.assertEqual(results[0]['id'], d3.id)
+ self.assertEqual(results[1]['id'], d1.id)
+
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index b4b19be4c..f53981850 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -29,81 +29,6 @@ class TestAttributes(TestCase):
self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename)
- def test_guess_attributes_from_name0(self):
- self._test_guess_attributes_from_name(
- "Sender - Title.pdf", "Sender", "Title", ())
-
- def test_guess_attributes_from_name1(self):
- self._test_guess_attributes_from_name(
- "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())
-
- def test_guess_attributes_from_name2(self):
- self._test_guess_attributes_from_name(
- "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())
-
- def test_guess_attributes_from_name3(self):
- self._test_guess_attributes_from_name(
- "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())
-
- def test_guess_attributes_from_name4(self):
- self._test_guess_attributes_from_name(
- "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())
-
- def test_guess_attributes_from_name5(self):
- self._test_guess_attributes_from_name(
- "Sender - Title - tag1,tag2,tag3.pdf",
- "Sender",
- "Title",
- self.TAGS
- )
-
- def test_guess_attributes_from_name6(self):
- self._test_guess_attributes_from_name(
- "Spaced Sender - Title - tag1,tag2,tag3.pdf",
- "Spaced Sender",
- "Title",
- self.TAGS
- )
-
- def test_guess_attributes_from_name7(self):
- self._test_guess_attributes_from_name(
- "Sender - Spaced Title - tag1,tag2,tag3.pdf",
- "Sender",
- "Spaced Title",
- self.TAGS
- )
-
- def test_guess_attributes_from_name8(self):
- self._test_guess_attributes_from_name(
- "Dashed-Sender - Title - tag1,tag2,tag3.pdf",
- "Dashed-Sender",
- "Title",
- self.TAGS
- )
-
- def test_guess_attributes_from_name9(self):
- self._test_guess_attributes_from_name(
- "Sender - Dashed-Title - tag1,tag2,tag3.pdf",
- "Sender",
- "Dashed-Title",
- self.TAGS
- )
-
- def test_guess_attributes_from_name10(self):
- self._test_guess_attributes_from_name(
- "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
- "Σενδερ",
- "Τιτλε",
- self.TAGS
- )
-
- def test_guess_attributes_from_name_when_correspondent_empty(self):
- self._test_guess_attributes_from_name(
- ' - weird empty correspondent but should not break.pdf',
- None,
- 'weird empty correspondent but should not break',
- ()
- )
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
self._test_guess_attributes_from_name(
@@ -121,28 +46,6 @@ class TestAttributes(TestCase):
()
)
- def test_guess_attributes_from_name_when_title_is_empty(self):
- self._test_guess_attributes_from_name(
- 'weird correspondent but should not break - .pdf',
- 'weird correspondent but should not break',
- '',
- ()
- )
-
- def test_case_insensitive_tag_creation(self):
- """
- Tags should be detected and created as lower case.
- :return:
- """
-
- filename = "Title - Correspondent - tAg1,TAG2.pdf"
- self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
-
- path = "Title - Correspondent - tag1,tag2.pdf"
- self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
-
- self.assertEqual(Tag.objects.all().count(), 2)
-
class TestFieldPermutations(TestCase):
@@ -199,69 +102,7 @@ class TestFieldPermutations(TestCase):
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)
- def test_title_and_correspondent(self):
- template = '{correspondent} - {title}.pdf'
- for correspondent in self.valid_correspondents:
- for title in self.valid_titles:
- spec = dict(correspondent=correspondent, title=title)
- filename = template.format(**spec)
- self._test_guessed_attributes(filename, **spec)
-
- def test_title_and_correspondent_and_tags(self):
- template = '{correspondent} - {title} - {tags}.pdf'
- for correspondent in self.valid_correspondents:
- for title in self.valid_titles:
- for tags in self.valid_tags:
- spec = dict(correspondent=correspondent, title=title,
- tags=tags)
- filename = template.format(**spec)
- self._test_guessed_attributes(filename, **spec)
-
- def test_created_and_correspondent_and_title_and_tags(self):
-
- template = (
- "{created} - "
- "{correspondent} - "
- "{title} - "
- "{tags}.pdf"
- )
-
- for created in self.valid_dates:
- for correspondent in self.valid_correspondents:
- for title in self.valid_titles:
- for tags in self.valid_tags:
- spec = {
- "created": created,
- "correspondent": correspondent,
- "title": title,
- "tags": tags,
- }
- self._test_guessed_attributes(
- template.format(**spec), **spec)
-
- def test_created_and_correspondent_and_title(self):
-
- template = "{created} - {correspondent} - {title}.pdf"
-
- for created in self.valid_dates:
- for correspondent in self.valid_correspondents:
- for title in self.valid_titles:
-
- # Skip cases where title looks like a tag as we can't
- # accommodate such cases.
- if title.lower() == title:
- continue
-
- spec = {
- "created": created,
- "correspondent": correspondent,
- "title": title
- }
- self._test_guessed_attributes(
- template.format(**spec), **spec)
-
def test_created_and_title(self):
-
template = "{created} - {title}.pdf"
for created in self.valid_dates:
@@ -273,21 +114,6 @@ class TestFieldPermutations(TestCase):
self._test_guessed_attributes(
template.format(**spec), **spec)
- def test_created_and_title_and_tags(self):
-
- template = "{created} - {title} - {tags}.pdf"
-
- for created in self.valid_dates:
- for title in self.valid_titles:
- for tags in self.valid_tags:
- spec = {
- "created": created,
- "title": title,
- "tags": tags
- }
- self._test_guessed_attributes(
- template.format(**spec), **spec)
-
def test_invalid_date_format(self):
info = FileInfo.from_filename("06112017Z - title.pdf")
self.assertEqual(info.title, "title")
@@ -336,32 +162,6 @@ class TestFieldPermutations(TestCase):
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "anotherall")
- # Complex transformation without date in replacement string
- with self.settings(
- FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
- info = FileInfo.from_filename(filename)
- self.assertEqual(info.title, "0001")
- self.assertEqual(len(info.tags), 2)
- self.assertEqual(info.tags[0].name, "tag1")
- self.assertEqual(info.tags[1].name, "tag2")
- self.assertIsNone(info.created)
-
- # Complex transformation with date in replacement string
- with self.settings(
- FILENAME_PARSE_TRANSFORMS=[
- (none_patt, "none.gif"),
- (exact_patt, repl2), # <-- matches
- (exact_patt, repl1),
- (all_patt, "all.gif")]):
- info = FileInfo.from_filename(filename)
- self.assertEqual(info.title, "0001")
- self.assertEqual(len(info.tags), 2)
- self.assertEqual(info.tags[0].name, "tag1")
- self.assertEqual(info.tags[1].name, "tag2")
- self.assertEqual(info.created.year, 2019)
- self.assertEqual(info.created.month, 9)
- self.assertEqual(info.created.day, 8)
-
class DummyParser(DocumentParser):
@@ -476,15 +276,13 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testOverrideFilename(self):
filename = self.get_test_file()
- override_filename = "My Bank - Statement for November.pdf"
+ override_filename = "Statement for November.pdf"
document = self.consumer.try_consume_file(filename, override_filename=override_filename)
- self.assertEqual(document.correspondent.name, "My Bank")
self.assertEqual(document.title, "Statement for November")
def testOverrideTitle(self):
-
document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
self.assertEqual(document.title, "Override Title")
@@ -594,11 +392,10 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testFilenameHandling(self):
filename = self.get_test_file()
- document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
+ document = self.consumer.try_consume_file(filename, override_title="new docs")
self.assertEqual(document.title, "new docs")
- self.assertEqual(document.correspondent.name, "Bank")
- self.assertEqual(document.filename, "Bank/new docs.pdf")
+ self.assertEqual(document.filename, "none/new docs.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.generate_unique_filename")
@@ -617,10 +414,9 @@ class TestConsumer(DirectoriesMixin, TestCase):
Tag.objects.create(name="test", is_inbox_tag=True)
- document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
+ document = self.consumer.try_consume_file(filename, override_title="new docs")
self.assertEqual(document.title, "new docs")
- self.assertEqual(document.correspondent.name, "Bank")
self.assertIsNotNone(os.path.isfile(document.title))
self.assertTrue(os.path.isfile(document.source_path))
@@ -642,3 +438,31 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(document.document_type, dtype)
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())
+
+ @override_settings(CONSUMER_DELETE_DUPLICATES=True)
+ def test_delete_duplicate(self):
+ dst = self.get_test_file()
+ self.assertTrue(os.path.isfile(dst))
+ doc = self.consumer.try_consume_file(dst)
+
+ self.assertFalse(os.path.isfile(dst))
+ self.assertIsNotNone(doc)
+
+ dst = self.get_test_file()
+ self.assertTrue(os.path.isfile(dst))
+ self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
+ self.assertFalse(os.path.isfile(dst))
+
+ @override_settings(CONSUMER_DELETE_DUPLICATES=False)
+ def test_no_delete_duplicate(self):
+ dst = self.get_test_file()
+ self.assertTrue(os.path.isfile(dst))
+ doc = self.consumer.try_consume_file(dst)
+
+ self.assertFalse(os.path.isfile(dst))
+ self.assertIsNotNone(doc)
+
+ dst = self.get_test_file()
+ self.assertTrue(os.path.isfile(dst))
+ self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
+ self.assertTrue(os.path.isfile(dst))
diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py
index 2e60065f1..b24f52aa2 100644
--- a/src/documents/tests/test_file_handling.py
+++ b/src/documents/tests/test_file_handling.py
@@ -14,7 +14,7 @@ from django.utils import timezone
from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \
generate_unique_filename
-from ..models import Document, Correspondent, Tag
+from ..models import Document, Correspondent, Tag, DocumentType
class TestFileHandling(DirectoriesMixin, TestCase):
@@ -190,6 +190,17 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
self.assertTrue(os.path.isfile(important_file))
+ @override_settings(PAPERLESS_FILENAME_FORMAT="{document_type} - {title}")
+ def test_document_type(self):
+ dt = DocumentType.objects.create(name="my_doc_type")
+ d = Document.objects.create(title="the_doc", mime_type="application/pdf")
+
+ self.assertEqual(generate_filename(d), "none - the_doc.pdf")
+
+ d.document_type = dt
+
+ self.assertEqual(generate_filename(d), "my_doc_type - the_doc.pdf")
+
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py
new file mode 100644
index 000000000..58aaf9342
--- /dev/null
+++ b/src/documents/tests/test_management.py
@@ -0,0 +1,135 @@
+import hashlib
+import tempfile
+import filecmp
+import os
+import shutil
+from pathlib import Path
+from unittest import mock
+
+from django.test import TestCase, override_settings
+
+
+from django.core.management import call_command
+
+from documents.file_handling import generate_filename
+from documents.management.commands.document_archiver import handle_document
+from documents.models import Document
+from documents.tests.utils import DirectoriesMixin
+
+
+sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
+
+
+class TestArchiver(DirectoriesMixin, TestCase):
+
+ def make_models(self):
+ return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf")
+
+ def test_archiver(self):
+
+ doc = self.make_models()
+ shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
+
+ call_command('document_archiver')
+
+ def test_handle_document(self):
+
+ doc = self.make_models()
+ shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
+
+ handle_document(doc.pk)
+
+ doc = Document.objects.get(id=doc.id)
+
+ self.assertIsNotNone(doc.checksum)
+ self.assertTrue(os.path.isfile(doc.archive_path))
+ self.assertTrue(os.path.isfile(doc.source_path))
+ self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
+
+
+class TestDecryptDocuments(TestCase):
+
+ @override_settings(
+ ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
+ THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
+ PASSPHRASE="test",
+ PAPERLESS_FILENAME_FORMAT=None
+ )
+ @mock.patch("documents.management.commands.decrypt_documents.input")
+ def test_decrypt(self, m):
+
+ media_dir = tempfile.mkdtemp()
+ originals_dir = os.path.join(media_dir, "documents", "originals")
+ thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
+ os.makedirs(originals_dir, exist_ok=True)
+ os.makedirs(thumb_dir, exist_ok=True)
+
+ override_settings(
+ ORIGINALS_DIR=originals_dir,
+ THUMBNAIL_DIR=thumb_dir,
+ PASSPHRASE="test"
+ ).enable()
+
+ doc = Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
+
+ shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
+ shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", f"0000002.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"))
+
+ call_command('decrypt_documents')
+
+ doc.refresh_from_db()
+
+ self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
+ self.assertEqual(doc.filename, "0000002.pdf")
+ self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
+ self.assertTrue(os.path.isfile(doc.source_path))
+ self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png")))
+ self.assertTrue(os.path.isfile(doc.thumbnail_path))
+
+ with doc.source_file as f:
+ checksum = hashlib.md5(f.read()).hexdigest()
+ self.assertEqual(checksum, doc.checksum)
+
+
+class TestMakeIndex(TestCase):
+
+ @mock.patch("documents.management.commands.document_index.index_reindex")
+ def test_reindex(self, m):
+ call_command("document_index", "reindex")
+ m.assert_called_once()
+
+ @mock.patch("documents.management.commands.document_index.index_optimize")
+ def test_optimize(self, m):
+ call_command("document_index", "optimize")
+ m.assert_called_once()
+
+
+class TestRenamer(DirectoriesMixin, TestCase):
+
+ def test_rename(self):
+ doc = Document.objects.create(title="test", mime_type="application/pdf")
+ doc.filename = generate_filename(doc)
+ doc.save()
+
+ Path(doc.source_path).touch()
+
+ old_source_path = doc.source_path
+
+ with override_settings(PAPERLESS_FILENAME_FORMAT="{title}"):
+ call_command("document_renamer")
+
+ doc2 = Document.objects.get(id=doc.id)
+
+ self.assertEqual(doc2.filename, "test.pdf")
+ self.assertFalse(os.path.isfile(old_source_path))
+ self.assertFalse(os.path.isfile(doc.source_path))
+ self.assertTrue(os.path.isfile(doc2.source_path))
+
+
+class TestCreateClassifier(TestCase):
+
+ @mock.patch("documents.management.commands.document_create_classifier.train_classifier")
+ def test_create_classifier(self, m):
+ call_command("document_create_classifier")
+
+ m.assert_called_once()
diff --git a/src/documents/tests/test_management_archiver.py b/src/documents/tests/test_management_archiver.py
deleted file mode 100644
index 0828f05ff..000000000
--- a/src/documents/tests/test_management_archiver.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import filecmp
-import os
-import shutil
-
-from django.core.management import call_command
-from django.test import TestCase
-
-from documents.management.commands.document_archiver import handle_document
-from documents.models import Document
-from documents.tests.utils import DirectoriesMixin
-
-
-sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
-
-
-class TestArchiver(DirectoriesMixin, TestCase):
-
- def make_models(self):
- return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf")
-
- def test_archiver(self):
-
- doc = self.make_models()
- shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
-
- call_command('document_archiver')
-
- def test_handle_document(self):
-
- doc = self.make_models()
- shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
-
- handle_document(doc.pk)
-
- doc = Document.objects.get(id=doc.id)
-
- self.assertIsNotNone(doc.checksum)
- self.assertTrue(os.path.isfile(doc.archive_path))
- self.assertTrue(os.path.isfile(doc.source_path))
- self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
diff --git a/src/documents/tests/test_management_decrypt.py b/src/documents/tests/test_management_decrypt.py
deleted file mode 100644
index 1d64b1105..000000000
--- a/src/documents/tests/test_management_decrypt.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import hashlib
-import json
-import os
-import shutil
-import tempfile
-from unittest import mock
-
-from django.core.management import call_command
-from django.test import TestCase, override_settings
-
-from documents.management.commands import document_exporter
-from documents.models import Document, Tag, DocumentType, Correspondent
-
-
-class TestDecryptDocuments(TestCase):
-
- @override_settings(
- ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
- THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
- PASSPHRASE="test",
- PAPERLESS_FILENAME_FORMAT=None
- )
- @mock.patch("documents.management.commands.decrypt_documents.input")
- def test_decrypt(self, m):
-
- media_dir = tempfile.mkdtemp()
- originals_dir = os.path.join(media_dir, "documents", "originals")
- thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
- os.makedirs(originals_dir, exist_ok=True)
- os.makedirs(thumb_dir, exist_ok=True)
-
- override_settings(
- ORIGINALS_DIR=originals_dir,
- THUMBNAIL_DIR=thumb_dir,
- PASSPHRASE="test"
- ).enable()
-
- doc = Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
-
- shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
- shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", f"0000002.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"))
-
- call_command('decrypt_documents')
-
- doc.refresh_from_db()
-
- self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
- self.assertEqual(doc.filename, "0000002.pdf")
- self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
- self.assertTrue(os.path.isfile(doc.source_path))
- self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png")))
- self.assertTrue(os.path.isfile(doc.thumbnail_path))
-
- with doc.source_file as f:
- checksum = hashlib.md5(f.read()).hexdigest()
- self.assertEqual(checksum, doc.checksum)
-
diff --git a/src/documents/tests/test_migrations.py b/src/documents/tests/test_migrations.py
new file mode 100644
index 000000000..33ba41444
--- /dev/null
+++ b/src/documents/tests/test_migrations.py
@@ -0,0 +1,129 @@
+import os
+import shutil
+from pathlib import Path
+
+from django.apps import apps
+from django.conf import settings
+from django.db import connection
+from django.db.migrations.executor import MigrationExecutor
+from django.test import TestCase, TransactionTestCase, override_settings
+
+from documents.models import Document
+from documents.parsers import get_default_file_extension
+from documents.tests.utils import DirectoriesMixin
+
+
+class TestMigrations(TransactionTestCase):
+
+ @property
+ def app(self):
+ return apps.get_containing_app_config(type(self).__module__).name
+
+ migrate_from = None
+ migrate_to = None
+
+ def setUp(self):
+ super(TestMigrations, self).setUp()
+
+ assert self.migrate_from and self.migrate_to, \
+ "TestCase '{}' must define migrate_from and migrate_to properties".format(type(self).__name__)
+ self.migrate_from = [(self.app, self.migrate_from)]
+ self.migrate_to = [(self.app, self.migrate_to)]
+ executor = MigrationExecutor(connection)
+ old_apps = executor.loader.project_state(self.migrate_from).apps
+
+ # Reverse to the original migration
+ executor.migrate(self.migrate_from)
+
+ self.setUpBeforeMigration(old_apps)
+
+ # Run the migration to test
+ executor = MigrationExecutor(connection)
+ executor.loader.build_graph() # reload.
+ executor.migrate(self.migrate_to)
+
+ self.apps = executor.loader.project_state(self.migrate_to).apps
+
+ def setUpBeforeMigration(self, apps):
+ pass
+
+
+STORAGE_TYPE_UNENCRYPTED = "unencrypted"
+STORAGE_TYPE_GPG = "gpg"
+
+
+def source_path_before(self):
+ if self.filename:
+ fname = str(self.filename)
+ else:
+ fname = "{:07}.{}".format(self.pk, self.file_type)
+ if self.storage_type == STORAGE_TYPE_GPG:
+ fname += ".gpg"
+
+ return os.path.join(
+ settings.ORIGINALS_DIR,
+ fname
+ )
+
+
+def file_type_after(self):
+ return get_default_file_extension(self.mime_type)
+
+
+def source_path_after(doc):
+ if doc.filename:
+ fname = str(doc.filename)
+ else:
+ fname = "{:07}{}".format(doc.pk, file_type_after(doc))
+ if doc.storage_type == STORAGE_TYPE_GPG:
+ fname += ".gpg" # pragma: no cover
+
+ return os.path.join(
+ settings.ORIGINALS_DIR,
+ fname
+ )
+
+
+@override_settings(PASSPHRASE="test")
+class TestMigrateMimeType(DirectoriesMixin, TestMigrations):
+
+ migrate_from = '1002_auto_20201111_1105'
+ migrate_to = '1003_mime_types'
+
+ def setUpBeforeMigration(self, apps):
+ Document = apps.get_model("documents", "Document")
+ doc = Document.objects.create(title="test", file_type="pdf", filename="file1.pdf")
+ self.doc_id = doc.id
+ shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_before(doc))
+
+ doc2 = Document.objects.create(checksum="B", file_type="pdf", storage_type=STORAGE_TYPE_GPG)
+ self.doc2_id = doc2.id
+ shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), source_path_before(doc2))
+
+ def testMimeTypesMigrated(self):
+ Document = self.apps.get_model('documents', 'Document')
+
+ doc = Document.objects.get(id=self.doc_id)
+ self.assertEqual(doc.mime_type, "application/pdf")
+
+ doc2 = Document.objects.get(id=self.doc2_id)
+ self.assertEqual(doc2.mime_type, "application/pdf")
+
+
+@override_settings(PASSPHRASE="test")
+class TestMigrateMimeTypeBackwards(DirectoriesMixin, TestMigrations):
+
+ migrate_from = '1003_mime_types'
+ migrate_to = '1002_auto_20201111_1105'
+
+ def setUpBeforeMigration(self, apps):
+ Document = apps.get_model("documents", "Document")
+ doc = Document.objects.create(title="test", mime_type="application/pdf", filename="file1.pdf")
+ self.doc_id = doc.id
+ shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_after(doc))
+
+ def testMimeTypesReverted(self):
+ Document = self.apps.get_model('documents', 'Document')
+
+ doc = Document.objects.get(id=self.doc_id)
+ self.assertEqual(doc.file_type, "pdf")
diff --git a/src/documents/views.py b/src/documents/views.py
index ebe41c9d1..7907c0231 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -389,14 +389,27 @@ class SearchView(APIView):
}
def get(self, request, format=None):
- if 'query' not in request.query_params:
+
+ if 'query' in request.query_params:
+ query = request.query_params['query']
+ else:
+ query = None
+
+ if 'more_like' in request.query_params:
+ more_like_id = request.query_params['more_like']
+ more_like_content = Document.objects.get(id=more_like_id).content
+ else:
+ more_like_id = None
+ more_like_content = None
+
+ if not query and not more_like_id:
return Response({
'count': 0,
'page': 0,
'page_count': 0,
+ 'corrected_query': None,
'results': []})
- query = request.query_params['query']
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
@@ -406,8 +419,7 @@ class SearchView(APIView):
page = 1
try:
- with index.query_page(self.ix, query, page) as (result_page,
- corrected_query):
+ with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
diff --git a/src/paperless/checks.py b/src/paperless/checks.py
index 819582ffc..1329ad679 100644
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -13,18 +13,17 @@ writeable_hint = (
)
-def path_check(env_var):
+def path_check(var, directory):
messages = []
- directory = os.getenv(env_var)
if directory:
if not os.path.exists(directory):
messages.append(Error(
- exists_message.format(env_var),
+ exists_message.format(var),
exists_hint.format(directory)
))
elif not os.access(directory, os.W_OK | os.X_OK):
messages.append(Error(
- writeable_message.format(env_var),
+ writeable_message.format(var),
writeable_hint.format(directory)
))
return messages
@@ -36,12 +35,9 @@ def paths_check(app_configs, **kwargs):
Check the various paths for existence, readability and writeability
"""
- check_messages = path_check("PAPERLESS_DATA_DIR") + \
- path_check("PAPERLESS_MEDIA_ROOT") + \
- path_check("PAPERLESS_CONSUMPTION_DIR") + \
- path_check("PAPERLESS_STATICDIR")
-
- return check_messages
+ return path_check("PAPERLESS_DATA_DIR", settings.DATA_DIR) + \
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT) + \
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
@register()
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 1a6b80a0c..c6f7c9357 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -160,13 +160,6 @@ if AUTO_LOGIN_USERNAME:
MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware')
-if DEBUG:
- X_FRAME_OPTIONS = ''
- # this should really be 'allow-from uri' but its not supported in any mayor
- # browser.
-else:
- X_FRAME_OPTIONS = 'SAMEORIGIN'
-
# We allow CORS from localhost:8080
CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))
diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py
new file mode 100644
index 000000000..e1525cab8
--- /dev/null
+++ b/src/paperless/tests/test_checks.py
@@ -0,0 +1,54 @@
+import os
+import shutil
+
+from django.test import TestCase, override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from paperless import binaries_check, paths_check
+from paperless.checks import debug_mode_check
+
+
+class TestChecks(DirectoriesMixin, TestCase):
+
+ def test_binaries(self):
+ self.assertEqual(binaries_check(None), [])
+
+ @override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot")
+ def test_binaries_fail(self):
+ self.assertEqual(len(binaries_check(None)), 2)
+
+ def test_paths_check(self):
+ self.assertEqual(paths_check(None), [])
+
+ @override_settings(MEDIA_ROOT="uuh",
+ DATA_DIR="whatever",
+ CONSUMPTION_DIR="idontcare")
+ def test_paths_check_dont_exist(self):
+ msgs = paths_check(None)
+ self.assertEqual(len(msgs), 3, str(msgs))
+
+ for msg in msgs:
+ self.assertTrue(msg.msg.endswith("is set but doesn't exist."))
+
+ def test_paths_check_no_access(self):
+ os.chmod(self.dirs.data_dir, 0o000)
+ os.chmod(self.dirs.media_dir, 0o000)
+ os.chmod(self.dirs.consumption_dir, 0o000)
+
+ self.addCleanup(os.chmod, self.dirs.data_dir, 0o777)
+ self.addCleanup(os.chmod, self.dirs.media_dir, 0o777)
+ self.addCleanup(os.chmod, self.dirs.consumption_dir, 0o777)
+
+ msgs = paths_check(None)
+ self.assertEqual(len(msgs), 3)
+
+ for msg in msgs:
+ self.assertTrue(msg.msg.endswith("is not writeable"))
+
+ @override_settings(DEBUG=False)
+ def test_debug_disabled(self):
+ self.assertEqual(debug_mode_check(None), [])
+
+ @override_settings(DEBUG=True)
+ def test_debug_enabled(self):
+ self.assertEqual(len(debug_mode_check(None)), 1)
diff --git a/src/paperless_tesseract/checks.py b/src/paperless_tesseract/checks.py
index 41ea3c9b5..d58b7ac6d 100644
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@@ -1,7 +1,7 @@
import subprocess
from django.conf import settings
-from django.core.checks import Error, register
+from django.core.checks import Error, Warning, register
def get_tesseract_langs():
diff --git a/src/paperless_tesseract/languages.py b/src/paperless_tesseract/languages.py
deleted file mode 100644
index 5ea560654..000000000
--- a/src/paperless_tesseract/languages.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# Thanks to the Library of Congress and some creative use of sed and awk:
-# http://www.loc.gov/standards/iso639-2/php/English_list.php
-
-ISO639 = {
-
- "aa": "aar",
- "ab": "abk",
- "ae": "ave",
- "af": "afr",
- "ak": "aka",
- "am": "amh",
- "an": "arg",
- "ar": "ara",
- "as": "asm",
- "av": "ava",
- "ay": "aym",
- "az": "aze",
- "ba": "bak",
- "be": "bel",
- "bg": "bul",
- "bh": "bih",
- "bi": "bis",
- "bm": "bam",
- "bn": "ben",
- "bo": "bod",
- "br": "bre",
- "bs": "bos",
- "ca": "cat",
- "ce": "che",
- "ch": "cha",
- "co": "cos",
- "cr": "cre",
- "cs": "ces",
- "cu": "chu",
- "cv": "chv",
- "cy": "cym",
- "da": "dan",
- "de": "deu",
- "dv": "div",
- "dz": "dzo",
- "ee": "ewe",
- "el": "ell",
- "en": "eng",
- "eo": "epo",
- "es": "spa",
- "et": "est",
- "eu": "eus",
- "fa": "fas",
- "ff": "ful",
- "fi": "fin",
- "fj": "fij",
- "fo": "fao",
- "fr": "fra",
- "fy": "fry",
- "ga": "gle",
- "gd": "gla",
- "gl": "glg",
- "gn": "grn",
- "gu": "guj",
- "gv": "glv",
- "ha": "hau",
- "he": "heb",
- "hi": "hin",
- "ho": "hmo",
- "hr": "hrv",
- "ht": "hat",
- "hu": "hun",
- "hy": "hye",
- "hz": "her",
- "ia": "ina",
- "id": "ind",
- "ie": "ile",
- "ig": "ibo",
- "ii": "iii",
- "ik": "ipk",
- "io": "ido",
- "is": "isl",
- "it": "ita",
- "iu": "iku",
- "ja": "jpn",
- "jv": "jav",
- "ka": "kat",
- "kg": "kon",
- "ki": "kik",
- "kj": "kua",
- "kk": "kaz",
- "kl": "kal",
- "km": "khm",
- "kn": "kan",
- "ko": "kor",
- "kr": "kau",
- "ks": "kas",
- "ku": "kur",
- "kv": "kom",
- "kw": "cor",
- "ky": "kir",
- "la": "lat",
- "lb": "ltz",
- "lg": "lug",
- "li": "lim",
- "ln": "lin",
- "lo": "lao",
- "lt": "lit",
- "lu": "lub",
- "lv": "lav",
- "mg": "mlg",
- "mh": "mah",
- "mi": "mri",
- "mk": "mkd",
- "ml": "mal",
- "mn": "mon",
- "mr": "mar",
- "ms": "msa",
- "mt": "mlt",
- "my": "mya",
- "na": "nau",
- "nb": "nob",
- "nd": "nde",
- "ne": "nep",
- "ng": "ndo",
- "nl": "nld",
- "no": "nor",
- "nr": "nbl",
- "nv": "nav",
- "ny": "nya",
- "oc": "oci",
- "oj": "oji",
- "om": "orm",
- "or": "ori",
- "os": "oss",
- "pa": "pan",
- "pi": "pli",
- "pl": "pol",
- "ps": "pus",
- "pt": "por",
- "qu": "que",
- "rm": "roh",
- "rn": "run",
- "ro": "ron",
- "ru": "rus",
- "rw": "kin",
- "sa": "san",
- "sc": "srd",
- "sd": "snd",
- "se": "sme",
- "sg": "sag",
- "si": "sin",
- "sk": "slk",
- "sl": "slv",
- "sm": "smo",
- "sn": "sna",
- "so": "som",
- "sq": "sqi",
- "sr": "srp",
- "ss": "ssw",
- "st": "sot",
- "su": "sun",
- "sv": "swe",
- "sw": "swa",
- "ta": "tam",
- "te": "tel",
- "tg": "tgk",
- "th": "tha",
- "ti": "tir",
- "tk": "tuk",
- "tl": "tgl",
- "tn": "tsn",
- "to": "ton",
- "tr": "tur",
- "ts": "tso",
- "tt": "tat",
- "tw": "twi",
- "ty": "tah",
- "ug": "uig",
- "uk": "ukr",
- "ur": "urd",
- "uz": "uzb",
- "ve": "ven",
- "vi": "vie",
- "vo": "vol",
- "wa": "wln",
- "wo": "wol",
- "xh": "xho",
- "yi": "yid",
- "yo": "yor",
- "za": "zha",
-
- # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I
- # have no idea which one is better, so I just picked the bigger file.
- "zh": "chi_tra",
-
- "zu": "zul"
-
-}
diff --git a/src/paperless_tesseract/tests/test_checks.py b/src/paperless_tesseract/tests/test_checks.py
new file mode 100644
index 000000000..c4f15764e
--- /dev/null
+++ b/src/paperless_tesseract/tests/test_checks.py
@@ -0,0 +1,26 @@
+from unittest import mock
+
+from django.core.checks import ERROR
+from django.test import TestCase, override_settings
+
+from paperless_tesseract import check_default_language_available
+
+
+class TestChecks(TestCase):
+
+ def test_default_language(self):
+ msgs = check_default_language_available(None)
+
+ @override_settings(OCR_LANGUAGE="")
+ def test_no_language(self):
+ msgs = check_default_language_available(None)
+ self.assertEqual(len(msgs), 1)
+ self.assertTrue(msgs[0].msg.startswith("No OCR language has been specified with PAPERLESS_OCR_LANGUAGE"))
+
+ @override_settings(OCR_LANGUAGE="ita")
+ @mock.patch("paperless_tesseract.checks.get_tesseract_langs")
+ def test_invalid_language(self, m):
+ m.return_value = ["deu", "eng"]
+ msgs = check_default_language_available(None)
+ self.assertEqual(len(msgs), 1)
+ self.assertEqual(msgs[0].level, ERROR)
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
index 7e488ca37..030c2c2c2 100644
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -35,15 +35,3 @@ class TextDocumentParser(DocumentParser):
def parse(self, document_path, mime_type):
with open(document_path, 'r') as f:
self.text = f.read()
-
-
-def run_command(*args):
- environment = os.environ.copy()
- if settings.CONVERT_MEMORY_LIMIT:
- environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
- if settings.CONVERT_TMPDIR:
- environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
-
- if not subprocess.Popen(' '.join(args), env=environment,
- shell=True).wait() == 0:
- raise ParseError("Convert failed at {}".format(args))