parent
61f37e77e3
commit
a2549a958a
|
@ -1,5 +1,44 @@
|
||||||
# Reference
|
# Reference
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
We'll use a test index for the examples that follow.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from tantivy import SchemaBuilder, Index, Document
|
||||||
|
schema = (
|
||||||
|
SchemaBuilder()
|
||||||
|
.add_integer_field("doc_id", indexed=True, stored=True)
|
||||||
|
.add_text_field("title", stored=True)
|
||||||
|
.add_text_field("body")
|
||||||
|
.build()
|
||||||
|
)
|
||||||
|
index = Index(schema=schema, path=None)
|
||||||
|
writer = index.writer(heap_size=15_000_000, num_threads=1)
|
||||||
|
doc = Document()
|
||||||
|
doc.add_integer("doc_id", 1)
|
||||||
|
doc.add_text("title", "The Old Man and the Sea")
|
||||||
|
doc.add_text(
|
||||||
|
"body",
|
||||||
|
(
|
||||||
|
"He was an old man who fished alone in a skiff in "
|
||||||
|
"the Gulf Stream and he had gone eighty-four days "
|
||||||
|
"now without taking a fish."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
|
||||||
|
doc = Document()
|
||||||
|
doc.add_integer("doc_id", 2)
|
||||||
|
doc.add_text("title", "The Old Man and the Sea II")
|
||||||
|
doc.add_text("body", "He was an old man who sailed alone.")
|
||||||
|
|
||||||
|
writer.add_document(doc)
|
||||||
|
writer.commit()
|
||||||
|
index.reload()
|
||||||
|
```
|
||||||
|
|
||||||
## Valid Query Formats
|
## Valid Query Formats
|
||||||
|
|
||||||
tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy.
|
tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy.
|
||||||
|
@ -7,6 +46,7 @@ Below a few basic query formats are shown:
|
||||||
|
|
||||||
- AND and OR conjunctions.
|
- AND and OR conjunctions.
|
||||||
```python
|
```python
|
||||||
|
searcher = index.searcher()
|
||||||
query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
|
query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
|
||||||
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
|
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
|
||||||
best_doc = searcher.doc(best_doc_address)
|
best_doc = searcher.doc(best_doc_address)
|
||||||
|
@ -29,7 +69,7 @@ best_doc = searcher.doc(best_doc_address)
|
||||||
|
|
||||||
- integer search
|
- integer search
|
||||||
```python
|
```python
|
||||||
query = index.parse_query('"eighty-four days"', ["doc_id"])
|
query = index.parse_query('1', ["doc_id"])
|
||||||
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
|
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
|
||||||
best_doc = searcher.doc(best_doc_address)
|
best_doc = searcher.doc(best_doc_address)
|
||||||
```
|
```
|
||||||
|
@ -46,8 +86,10 @@ the search query in additional quotes, as if a phrase query were being used.
|
||||||
The following will NOT work:
|
The following will NOT work:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Raises ValueError
|
try:
|
||||||
index.parse_query(r'sea\"', ["title", "body"])
|
index.parse_query(r'sea\"', ["title", "body"])
|
||||||
|
except ValueError as e:
|
||||||
|
assert str(e) == r'Syntax Error: sea\"'
|
||||||
```
|
```
|
||||||
|
|
||||||
However, the following will succeed:
|
However, the following will succeed:
|
||||||
|
|
|
@ -1 +1,2 @@
|
||||||
mkdocs==1.4.3
|
mkdocs==1.4.3
|
||||||
|
mktestdocs==0.2.1
|
||||||
|
|
|
@ -3,6 +3,8 @@
|
||||||
## Building an index and populating it
|
## Building an index and populating it
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
import tempfile
|
||||||
|
import pathlib
|
||||||
import tantivy
|
import tantivy
|
||||||
|
|
||||||
# Declaring our schema.
|
# Declaring our schema.
|
||||||
|
@ -20,7 +22,10 @@ To have a persistent index, use the path
|
||||||
parameter to store the index on the disk, e.g.:
|
parameter to store the index on the disk, e.g.:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
index = tantivy.Index(schema, path=os.getcwd() + '/index')
|
tmpdir = tempfile.TemporaryDirectory()
|
||||||
|
index_path = pathlib.Path(tmpdir.name) / "index"
|
||||||
|
index_path.mkdir()
|
||||||
|
persistent_index = tantivy.Index(schema, path=str(index_path))
|
||||||
```
|
```
|
||||||
|
|
||||||
By default, tantivy offers the following tokenizers
|
By default, tantivy offers the following tokenizers
|
||||||
|
@ -44,7 +49,8 @@ which can be used in tantivy-py:
|
||||||
|
|
||||||
To use the above tokenizers, simply provide them as a parameter to `add_text_field`, e.g.:
|
To use the above tokenizers, simply provide them as a parameter to `add_text_field`, e.g.:
|
||||||
```python
|
```python
|
||||||
schema_builder.add_text_field("body", stored=True, tokenizer_name='en_stem')
|
schema_builder_tok = tantivy.SchemaBuilder()
|
||||||
|
schema_builder_tok.add_text_field("body", stored=True, tokenizer_name='en_stem')
|
||||||
```
|
```
|
||||||
|
|
||||||
## Adding one document.
|
## Adding one document.
|
||||||
|
@ -77,6 +83,46 @@ query = index.parse_query("fish days", ["title", "body"])
|
||||||
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
|
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
|
||||||
best_doc = searcher.doc(best_doc_address)
|
best_doc = searcher.doc(best_doc_address)
|
||||||
assert best_doc["title"] == ["The Old Man and the Sea"]
|
assert best_doc["title"] == ["The Old Man and the Sea"]
|
||||||
print(best_doc)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Using the snippet generator
|
||||||
|
|
||||||
|
```python
|
||||||
|
hit_text = best_doc["body"][0]
|
||||||
|
print(f"{hit_text=}")
|
||||||
|
assert hit_text == (
|
||||||
|
"He was an old man who fished alone in a skiff in the "
|
||||||
|
"Gulf Stream and he had gone eighty-four days now "
|
||||||
|
"without taking a fish."
|
||||||
|
)
|
||||||
|
|
||||||
|
from tantivy import SnippetGenerator
|
||||||
|
snippet_generator = SnippetGenerator.create(
|
||||||
|
searcher, query, schema, "body"
|
||||||
|
)
|
||||||
|
snippet = snippet_generator.snippet_from_doc(best_doc)
|
||||||
|
```
|
||||||
|
|
||||||
|
The snippet object provides the hit ranges. These are the start and end
|
||||||
|
offsets of the spans in the text that match the query.
|
||||||
|
|
||||||
|
```python
|
||||||
|
highlights = snippet.highlighted()
|
||||||
|
first_highlight = highlights[0]
|
||||||
|
assert first_highlight.start == 93
|
||||||
|
assert first_highlight.end == 97
|
||||||
|
assert hit_text[first_highlight.start:first_highlight.end] == "days"
|
||||||
|
```
|
||||||
|
|
||||||
|
The snippet object can also generate a marked-up HTML snippet:
|
||||||
|
|
||||||
|
```python
|
||||||
|
html_snippet = snippet.to_html()
|
||||||
|
assert html_snippet == (
|
||||||
|
"He was an old man who fished alone in a skiff in the "
|
||||||
|
"Gulf Stream and he had gone eighty-four <b>days</b> now "
|
||||||
|
"without taking a <b>fish</b>"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,2 +1,3 @@
|
||||||
maturin
|
maturin
|
||||||
pytest>=4.0
|
pytest>=4.0
|
||||||
|
mktestdocs==0.2.1
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
from pathlib import Path
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from mktestdocs import check_md_file
|
||||||
|
|
||||||
|
def test_hello():
|
||||||
|
assert True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("filepath", Path("docs").glob("**/*.md"), ids=str)
|
||||||
|
def test_docs(filepath):
|
||||||
|
check_md_file(filepath, memory=True)
|
Loading…
Reference in New Issue