diff --git a/docs/reference.md b/docs/reference.md
index 78249a8..f86ad64 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -1,5 +1,44 @@
# Reference
+## Setup
+
+We'll use a test index for the examples that follow.
+
+```python
+import os
+from tantivy import SchemaBuilder, Index, Document
+schema = (
+ SchemaBuilder()
+ .add_integer_field("doc_id", indexed=True, stored=True)
+ .add_text_field("title", stored=True)
+ .add_text_field("body")
+ .build()
+)
+index = Index(schema=schema, path=None)
+writer = index.writer(heap_size=15_000_000, num_threads=1)
+doc = Document()
+doc.add_integer("doc_id", 1)
+doc.add_text("title", "The Old Man and the Sea")
+doc.add_text(
+    "body",
+    (
+        "He was an old man who fished alone in a skiff in "
+        "the Gulf Stream and he had gone eighty-four days "
+        "now without taking a fish."
+    ),
+)
+writer.add_document(doc)
+
+doc = Document()
+doc.add_integer("doc_id", 2)
+doc.add_text("title", "The Old Man and the Sea II")
+doc.add_text("body", "He was an old man who sailed alone.")
+
+writer.add_document(doc)
+writer.commit()
+index.reload()
+```
+
## Valid Query Formats
tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy.
@@ -7,6 +46,7 @@ Below a few basic query formats are shown:
- AND and OR conjunctions.
```python
+searcher = index.searcher()
query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
@@ -29,7 +69,7 @@ best_doc = searcher.doc(best_doc_address)
- integer search
```python
-query = index.parse_query('"eighty-four days"', ["doc_id"])
+query = index.parse_query('1', ["doc_id"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
```
@@ -46,8 +86,10 @@ the search query in additional quotes, as if a phrase query were being used.
The following will NOT work:
```python
-# Raises ValueError
-index.parse_query(r'sea\"', ["title", "body"])
+try:
+ index.parse_query(r'sea\"', ["title", "body"])
+except ValueError as e:
+ assert str(e) == r'Syntax Error: sea\"'
```
However, the following will succeed:
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 1a16205..eeac6f6 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1 +1,2 @@
mkdocs==1.4.3
+mktestdocs==0.2.1
diff --git a/docs/tutorials.md b/docs/tutorials.md
index 1e07a89..8388b98 100644
--- a/docs/tutorials.md
+++ b/docs/tutorials.md
@@ -3,6 +3,8 @@
## Building an index and populating it
```python
+import tempfile
+import pathlib
import tantivy
# Declaring our schema.
@@ -20,7 +22,10 @@ To have a persistent index, use the path
parameter to store the index on the disk, e.g:
```python
-index = tantivy.Index(schema, path=os.getcwd() + '/index')
+tmpdir = tempfile.TemporaryDirectory()
+index_path = pathlib.Path(tmpdir.name) / "index"
+index_path.mkdir()
+persistent_index = tantivy.Index(schema, path=str(index_path))
```
By default, tantivy offers the following tokenizers
@@ -44,7 +49,8 @@ which can be used in tantivy-py:
to use the above tokenizers, simply provide them as a parameter to `add_text_field`. e.g.
```python
-schema_builder.add_text_field("body", stored=True, tokenizer_name='en_stem')
+schema_builder_tok = tantivy.SchemaBuilder()
+schema_builder_tok.add_text_field("body", stored=True, tokenizer_name='en_stem')
```
## Adding one document.
@@ -77,6 +83,46 @@ query = index.parse_query("fish days", ["title", "body"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
assert best_doc["title"] == ["The Old Man and the Sea"]
-print(best_doc)
```
+## Using the snippet generator
+
+```python
+hit_text = best_doc["body"][0]
+print(f"{hit_text=}")
+assert hit_text == (
+ "He was an old man who fished alone in a skiff in the "
+ "Gulf Stream and he had gone eighty-four days now "
+ "without taking a fish."
+)
+
+from tantivy import SnippetGenerator
+snippet_generator = SnippetGenerator.create(
+ searcher, query, schema, "body"
+)
+snippet = snippet_generator.snippet_from_doc(best_doc)
+```
+
+The snippet object provides the hit ranges: the start and end offsets
+of each span in the text that matches the query.
+
+```python
+highlights = snippet.highlighted()
+first_highlight = highlights[0]
+assert first_highlight.start == 93
+assert first_highlight.end == 97
+assert hit_text[first_highlight.start:first_highlight.end] == "days"
+```
+
+The snippet object can also generate a marked-up HTML snippet:
+
+```python
+html_snippet = snippet.to_html()
+assert html_snippet == (
+    "He was an old man who fished alone in a skiff in the "
+    "Gulf Stream and he had gone eighty-four <b>days</b> now "
+    "without taking a <b>fish</b>"
+)
+```
+
+
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 20faf86..f64104f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +1,3 @@
maturin
pytest>=4.0
+mktestdocs==0.2.1
diff --git a/tests/test_docs.py b/tests/test_docs.py
new file mode 100644
index 0000000..9d059b3
--- /dev/null
+++ b/tests/test_docs.py
@@ -0,0 +1,32 @@
+"""Execute the Python code blocks embedded in the Markdown docs as tests."""
+from pathlib import Path
+
+import pytest
+from mktestdocs import check_md_file
+
+# Anchor the docs directory to this file instead of the current working
+# directory: Path("docs") only resolves when pytest is launched from the
+# repository root, and an empty glob would leave test_docs with no cases.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Sorted so the collection order does not depend on filesystem ordering.
+DOC_FILES = sorted((REPO_ROOT / "docs").glob("**/*.md"))
+
+
+def test_hello():
+    """Trivial check that this test module is collected and runs."""
+    assert True
+
+
+@pytest.mark.parametrize(
+    "filepath",
+    DOC_FILES,
+    # Ids stay repo-relative (e.g. docs/reference.md), independent of checkout path.
+    ids=lambda path: path.relative_to(REPO_ROOT).as_posix(),
+)
+def test_docs(filepath):
+    """Run the Python code blocks of one Markdown page.
+
+    ``memory=True`` asks mktestdocs to keep state between blocks, so later
+    snippets on a page can use names defined by earlier ones.
+    """
+    check_md_file(filepath, memory=True)