From a2549a958ab827b4144f667ad65eda0fede5af01 Mon Sep 17 00:00:00 2001 From: Caleb Hattingh Date: Wed, 20 Mar 2024 01:06:31 +0100 Subject: [PATCH] doc: Add mktestdocs, and tutorial for snippets. Fixes #219 (#223) --- docs/reference.md | 48 ++++++++++++++++++++++++++++++++++++--- docs/requirements.txt | 1 + docs/tutorials.md | 52 ++++++++++++++++++++++++++++++++++++++++--- requirements-dev.txt | 1 + tests/test_docs.py | 12 ++++++++++ 5 files changed, 108 insertions(+), 6 deletions(-) create mode 100644 tests/test_docs.py diff --git a/docs/reference.md b/docs/reference.md index 78249a8..f86ad64 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1,5 +1,44 @@ # Reference +## Setup + +We'll use a test index for the examples that follow. + +```python +import os +from tantivy import SchemaBuilder, Index, Document +schema = ( + SchemaBuilder() + .add_integer_field("doc_id", indexed=True, stored=True) + .add_text_field("title", stored=True) + .add_text_field("body") + .build() +) +index = Index(schema=schema, path=None) +writer = index.writer(heap_size=15_000_000, num_threads=1) +doc = Document() +doc.add_integer("doc_id", 1) +doc.add_text("title", "The Old Man and the Sea") +doc.add_text( + "body", + ( + "He was an old man who fished alone in a skiff in " + "the Gulf Stream and he had gone eighty-four days " + "now without taking a fish." + ), +) +writer.add_document(doc) + +doc = Document() +doc.add_integer("doc_id", 2) +doc.add_text("title", "The Old Man and the Sea II") +doc.add_text("body", "He was an old man who sailed alone.") + +writer.add_document(doc) +writer.commit() +index.reload() +``` + ## Valid Query Formats tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy. @@ -7,6 +46,7 @@ Below a few basic query formats are shown: - AND and OR conjunctions. 
```python +searcher = index.searcher() query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"]) (best_score, best_doc_address) = searcher.search(query, 3).hits[0] best_doc = searcher.doc(best_doc_address) @@ -29,7 +69,7 @@ best_doc = searcher.doc(best_doc_address) - integer search ```python -query = index.parse_query('"eighty-four days"', ["doc_id"]) +query = index.parse_query('1', ["doc_id"]) (best_score, best_doc_address) = searcher.search(query, 3).hits[0] best_doc = searcher.doc(best_doc_address) ``` @@ -46,8 +86,10 @@ the search query in additional quotes, as if a phrase query were being used. The following will NOT work: ```python -# Raises ValueError -index.parse_query(r'sea\"', ["title", "body"]) +try: + index.parse_query(r'sea\"', ["title", "body"]) +except ValueError as e: + assert str(e) == r'Syntax Error: sea\"' ``` However, the following will succeed: diff --git a/docs/requirements.txt b/docs/requirements.txt index 1a16205..eeac6f6 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,2 @@ mkdocs==1.4.3 +mktestdocs==0.2.1 diff --git a/docs/tutorials.md b/docs/tutorials.md index 1e07a89..8388b98 100644 --- a/docs/tutorials.md +++ b/docs/tutorials.md @@ -3,6 +3,8 @@ ## Building an index and populating it ```python +import tempfile +import pathlib import tantivy # Declaring our schema. @@ -20,7 +22,10 @@ To have a persistent index, use the path parameter to store the index on the disk, e.g: ```python -index = tantivy.Index(schema, path=os.getcwd() + '/index') +tmpdir = tempfile.TemporaryDirectory() +index_path = pathlib.Path(tmpdir.name) / "index" +index_path.mkdir() +persistent_index = tantivy.Index(schema, path=str(index_path)) ``` By default, tantivy offers the following tokenizers @@ -44,7 +49,8 @@ which can be used in tantivy-py: to use the above tokenizers, simply provide them as a parameter to `add_text_field`. e.g. 
```python -schema_builder.add_text_field("body", stored=True, tokenizer_name='en_stem') +schema_builder_tok = tantivy.SchemaBuilder() +schema_builder_tok.add_text_field("body", stored=True, tokenizer_name='en_stem') ``` ## Adding one document. @@ -77,6 +83,46 @@ query = index.parse_query("fish days", ["title", "body"]) (best_score, best_doc_address) = searcher.search(query, 3).hits[0] best_doc = searcher.doc(best_doc_address) assert best_doc["title"] == ["The Old Man and the Sea"] -print(best_doc) ``` +## Using the snippet generator + +```python +hit_text = best_doc["body"][0] +print(f"{hit_text=}") +assert hit_text == ( + "He was an old man who fished alone in a skiff in the " + "Gulf Stream and he had gone eighty-four days now " + "without taking a fish." +) + +from tantivy import SnippetGenerator +snippet_generator = SnippetGenerator.create( + searcher, query, schema, "body" +) +snippet = snippet_generator.snippet_from_doc(best_doc) +``` + +The snippet object provides the hit ranges. These are the marker +offsets in the text that match the query. 
+ +```python +highlights = snippet.highlighted() +first_highlight = highlights[0] +assert first_highlight.start == 93 +assert first_highlight.end == 97 +assert hit_text[first_highlight.start:first_highlight.end] == "days" +``` + +The snippet object can also generate a marked-up HTML snippet: + +```python +html_snippet = snippet.to_html() +assert html_snippet == ( + "He was an old man who fished alone in a skiff in the " + "Gulf Stream and he had gone eighty-four <b>days</b> now " + "without taking a <b>fish</b>" +) +``` + + diff --git a/requirements-dev.txt b/requirements-dev.txt index 20faf86..f64104f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,2 +1,3 @@ maturin pytest>=4.0 +mktestdocs==0.2.1 diff --git a/tests/test_docs.py b/tests/test_docs.py new file mode 100644 index 0000000..9d059b3 --- /dev/null +++ b/tests/test_docs.py @@ -0,0 +1,12 @@ +from pathlib import Path +import pytest + +from mktestdocs import check_md_file + +def test_hello(): + assert True + + +@pytest.mark.parametrize("filepath", Path("docs").glob("**/*.md"), ids=str) +def test_docs(filepath): + check_md_file(filepath, memory=True)