doc: Add mktestdocs, and tutorial for snippets. Fixes #219 (#223)

2024-03-20 01:06:31 +01:00 · 2024-03-20 01:06:31 +01:00 · a2549a958a
commit a2549a958a
parent 61f37e77e3
5 changed files with 108 additions and 6 deletions
--- a/docs/reference.md
+++ b/docs/reference.md
@ -1,5 +1,44 @@
 # Reference
 ## Setup
 We'll use a test index for the examples that follow.
 ```python
 import os
 from tantivy import SchemaBuilder, Index, Document
 # `doc_id` and `title` are declared with stored=True so they can be read
 # back from search hits; `body` is declared without it.
 schema = (
    SchemaBuilder()
        .add_integer_field("doc_id", indexed=True, stored=True)
        .add_text_field("title", stored=True)
        .add_text_field("body")
        .build()
 )
 index = Index(schema=schema, path=None)
 writer = index.writer(heap_size=15_000_000, num_threads=1)
 doc = Document()
 doc.add_integer("doc_id", 1)
 doc.add_text("title", "The Old Man and the Sea")
 # Adjacent string literals are concatenated as-is, so every fragment
 # except the last must carry its own trailing space.
 doc.add_text(
    "body",
    (
        "He was an old man who fished alone in a skiff in "
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    ),
 )
 writer.add_document(doc)
 doc = Document()
 doc.add_integer("doc_id", 2)
 doc.add_text("title", "The Old Man and the Sea II")
 doc.add_text("body", "He was an old man who sailed alone.")
 writer.add_document(doc)
 # Commit the pending documents, then reload the index before the
 # searches in the following sections.
 writer.commit()
 index.reload()
 ```
 ## Valid Query Formats
 tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy.
@ -7,6 +46,7 @@ Below a few basic query formats are shown:
 - AND and OR conjunctions.
 ```python
 searcher = index.searcher()
 query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
 (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
 best_doc = searcher.doc(best_doc_address)
@ -29,7 +69,7 @@ best_doc = searcher.doc(best_doc_address)
 - integer search
 ```python
-query = index.parse_query('"eighty-four days"', ["doc_id"])
+query = index.parse_query('1', ["doc_id"])
 (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
 best_doc = searcher.doc(best_doc_address)
 ```
@ -46,8 +86,10 @@ the search query in additional quotes, as if a phrase query were being used.
 The following will NOT work:
 ```python
-# Raises ValueError
+try:
    index.parse_query(r'sea\"', ["title", "body"])
 except ValueError as e:
    assert str(e) == r'Syntax Error: sea\"'
 ```
 However, the following will succeed:
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -1 +1,2 @@
 mkdocs==1.4.3
 mktestdocs==0.2.1
--- a/docs/tutorials.md
+++ b/docs/tutorials.md
@ -3,6 +3,8 @@
 ## Building an index and populating it
 ```python
 import tempfile
 import pathlib
 import tantivy
 # Declaring our schema.
@ -20,7 +22,10 @@ To have a persistent index, use the path
 parameter to store the index on disk, e.g.:
 ```python
-index = tantivy.Index(schema, path=os.getcwd() + '/index')
+tmpdir = tempfile.TemporaryDirectory()
 index_path = pathlib.Path(tmpdir.name) / "index"
 index_path.mkdir()
 persistent_index = tantivy.Index(schema, path=str(index_path))
 ```
 By default, tantivy offers the following tokenizers
@ -44,7 +49,8 @@ which can be used in tantivy-py:
 To use the above tokenizers, simply provide them as a parameter to `add_text_field`, e.g.:
 ```python
-schema_builder.add_text_field("body",  stored=True,  tokenizer_name='en_stem')
+schema_builder_tok = tantivy.SchemaBuilder()
 schema_builder_tok.add_text_field("body",  stored=True,  tokenizer_name='en_stem')
 ```
 ## Adding one document.
@ -77,6 +83,46 @@ query = index.parse_query("fish days", ["title", "body"])
 (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
 best_doc = searcher.doc(best_doc_address)
 assert best_doc["title"] == ["The Old Man and the Sea"]
 print(best_doc)
 ```
 ## Using the snippet generator
 ```python
 # `best_doc` is the top hit retrieved by the search example above.
 # Indexing with [0] takes the first value of its "body" field.
 hit_text = best_doc["body"][0]
 print(f"{hit_text=}")
 assert hit_text == (
    "He was an old man who fished alone in a skiff in the "
    "Gulf Stream and he had gone eighty-four days now "
    "without taking a fish."
 )
 from tantivy import SnippetGenerator
 # Build a generator for the "body" field from the `searcher`, `query`
 # and `schema` objects defined earlier on this page.
 snippet_generator = SnippetGenerator.create(
    searcher, query, schema, "body"
 )
 # Produce the snippet for the document that was hit.
 snippet = snippet_generator.snippet_from_doc(best_doc)
 ```
 The snippet object provides the hit ranges: the start and end
 offsets of each span in the text that matches the query.
 ```python
 # Each highlighted range exposes `start` and `end` offsets. As the last
 # assertion shows, they can be used directly to slice the original text.
 highlights = snippet.highlighted()
 first_highlight = highlights[0]
 assert first_highlight.start == 93
 assert first_highlight.end == 97
 assert hit_text[first_highlight.start:first_highlight.end] == "days"
 ```
 The snippet object can also generate a marked-up HTML snippet:
 ```python
 # In the expected output each matched term is wrapped in <b>...</b>.
 # Note that the expected string stops after the final highlight and does
 # not include the closing period of the source text.
 html_snippet = snippet.to_html()
 assert html_snippet == (
    "He was an old man who fished alone in a skiff in the "
    "Gulf Stream and he had gone eighty-four <b>days</b> now "
    "without taking a <b>fish</b>"
 )
 ```
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,2 +1,3 @@
 maturin
 pytest>=4.0
 mktestdocs==0.2.1
--- a/tests/test_docs.py
+++ b/tests/test_docs.py
@ -0,0 +1,12 @@
 from pathlib import Path
 import pytest
 from mktestdocs import check_md_file
 def test_hello():
     """Smoke test: passes unconditionally once this module is collected."""
     always_true = True
     assert always_true
# Resolve the docs directory from this file's location (tests/ sits next to
# docs/ at the repository root) instead of the current working directory.
# With a CWD-relative path, starting pytest outside the repository root
# produced an empty parameter set and the docs tests were silently skipped.
_DOCS_DIR = Path(__file__).resolve().parent.parent / "docs"


@pytest.mark.parametrize(
    "filepath",
    # Sorted so the collection order is deterministic across platforms.
    sorted(_DOCS_DIR.glob("**/*.md")),
    # Keep the same ids the CWD-relative glob produced (e.g. docs/reference.md).
    ids=lambda path: str(path.relative_to(_DOCS_DIR.parent)),
)
def test_docs(filepath):
    """Run the code fences of one markdown file under ``docs/``.

    Args:
        filepath: Path to the markdown file to check, one per test case.

    The file is checked with ``memory=True``; presumably this lets later
    fences reuse names defined by earlier fences in the same file (confirm
    against the mktestdocs documentation).
    """
    check_md_file(filepath, memory=True)
`@ -1 +1,2 @@`
	`mkdocs==1.4.3`	`mkdocs==1.4.3`
		`mktestdocs==0.2.1`