Snippet generator (fixes #36 and #63) (#122)

Co-authored-by: Justin Greene <justin.greene@intouchsol.com>
Co-authored-by: Paul Masurel <paul@quickwit.io>
Co-authored-by: Pratyush Mittal <pratyushmittal@gmail.com>
Co-authored-by: mukeshsahnis <er.mks89@gmail.com>
master
Caleb Hattingh 2023-09-13 12:03:39 +02:00 committed by GitHub
parent 72e221033c
commit f164b0ef4a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 132 additions and 6 deletions

6
.gitignore vendored
View File

@ -9,6 +9,6 @@ tantivy.so
tantivy.dylib
tantivy/tantivy.cpython*.so
tantivy.egg-info/
# Exclude the mkdocs site directory
site/
.venv
.envrc
site/

View File

@ -23,6 +23,21 @@ only supports python3.
# Development
To compile the Python module:
```bash
# create virtual env
python -m venv .venv
source .venv/bin/activate
# install maturin, the build tool for PyO3
pip install maturin
# compile and install python module in venv
maturin develop
```
Setting up a development environment can be done in a virtual environment using
[`nox`](https://nox.thea.codes) or using local packages using the provided `Makefile`.

View File

@ -797,7 +797,7 @@ impl Document {
Ok(())
}
fn iter_values_for_field<'a>(
pub fn iter_values_for_field<'a>(
&'a self,
field: &str,
) -> impl Iterator<Item = &'a Value> + 'a {

View File

@ -8,13 +8,16 @@ mod query;
mod schema;
mod schemabuilder;
mod searcher;
mod snippet;
use document::Document;
use facet::Facet;
use index::Index;
use query::Query;
use schema::Schema;
use schemabuilder::SchemaBuilder;
use searcher::{DocAddress, SearchResult, Searcher};
use snippet::{Snippet, SnippetGenerator};
/// Python bindings for the search engine library Tantivy.
///
@ -76,6 +79,9 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Index>()?;
m.add_class::<DocAddress>()?;
m.add_class::<Facet>()?;
m.add_class::<Query>()?;
m.add_class::<Snippet>()?;
m.add_class::<SnippetGenerator>()?;
Ok(())
}

81
src/snippet.rs Normal file
View File

@ -0,0 +1,81 @@
use crate::to_pyerr;
use pyo3::prelude::*;
use tantivy as tv;
/// A snippet of document text with highlighted fragments.
///
/// This is a thin Python-visible wrapper around Tantivy's `Snippet`.
/// Instances are returned by `SnippetGenerator.snippet_from_doc`; use
/// `to_html` for an HTML rendering or `highlighted` for the highlighted
/// ranges.
#[pyclass]
pub(crate) struct Snippet {
/// The wrapped Tantivy snippet that all methods delegate to.
pub(crate) inner: tv::Snippet,
}
/// One highlighted span of a snippet, exposed to Python as a pair of
/// read-only `start` / `end` attributes.
///
/// The values are copied verbatim from the ranges reported by the wrapped
/// Tantivy snippet.
/// NOTE(review): presumably byte offsets into the snippet text with an
/// exclusive `end` — confirm against the Tantivy documentation.
#[pyclass]
pub(crate) struct Range {
/// Offset at which the highlighted span begins.
#[pyo3(get)]
start: usize,
/// Offset at which the highlighted span ends.
#[pyo3(get)]
end: usize,
}
#[pymethods]
impl Snippet {
    /// Renders the snippet as an HTML string.
    ///
    /// Delegates to the wrapped Tantivy snippet; the result is always `Ok`.
    pub fn to_html(&self) -> PyResult<String> {
        let html = self.inner.to_html();
        Ok(html)
    }

    /// Returns the highlighted spans of the snippet as a list of `Range`
    /// objects, in the order reported by the wrapped Tantivy snippet.
    pub fn highlighted(&self) -> Vec<Range> {
        self.inner
            .highlighted()
            .iter()
            .map(|span| Range {
                start: span.start,
                end: span.end,
            })
            .collect()
    }
}
/// Builds `Snippet`s for a single field of documents, for a given query.
///
/// This is a Python-visible wrapper around Tantivy's `SnippetGenerator`.
/// Create one with `SnippetGenerator.create`, then call `snippet_from_doc`
/// for each document of interest.
#[pyclass]
pub(crate) struct SnippetGenerator {
/// Name of the field the generator was created for; used to pull the
/// field's values out of a document in `snippet_from_doc`.
pub(crate) field_name: String,
/// The wrapped Tantivy snippet generator.
pub(crate) inner: tv::SnippetGenerator,
}
#[pymethods]
impl SnippetGenerator {
    /// Creates a snippet generator for `field_name`, based on `query`.
    ///
    /// The field is looked up by name in `schema`, and the underlying
    /// Tantivy generator is built from `searcher`, `query` and that field.
    ///
    /// # Errors
    ///
    /// Returns a Python error (converted through `to_pyerr`) when the field
    /// lookup fails, with the message `"field not found"`, or when Tantivy
    /// fails to create the generator.
    #[staticmethod]
    pub fn create(
        searcher: &crate::Searcher,
        query: &crate::Query,
        schema: &crate::Schema,
        field_name: &str,
    ) -> PyResult<SnippetGenerator> {
        // The lookup error is deliberately replaced with a fixed message.
        let field = schema
            .inner
            .get_field(field_name)
            .or(Err("field not found"))
            .map_err(to_pyerr)?;
        let generator =
            tv::SnippetGenerator::create(&searcher.inner, query.get(), field)
                .map_err(to_pyerr)?;
        Ok(SnippetGenerator {
            field_name: field_name.to_string(),
            inner: generator,
        })
    }

    /// Builds a snippet from the text stored in `doc` for this generator's
    /// field.
    ///
    /// Every value of the field that `Value::as_text` yields text for is
    /// collected, the pieces are joined with a single space, and the joined
    /// text is passed to the underlying Tantivy generator. Values without
    /// text are skipped.
    pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet {
        let text: String = doc
            .iter_values_for_field(&self.field_name)
            .filter_map(tv::schema::Value::as_text)
            .collect::<Vec<&str>>()
            .join(" ");
        Snippet {
            inner: self.inner.snippet(&text),
        }
    }
}

View File

@ -5,8 +5,8 @@ import datetime
import tantivy
import pickle
import pytest
from tantivy import Document, Index, SchemaBuilder
import tantivy
from tantivy import Document, Index, SchemaBuilder, SnippetGenerator
def schema():
@ -784,3 +784,27 @@ def test_doc_address_pickle():
pickled = pickle.loads(pickle.dumps(orig))
assert orig == pickled
class TestSnippets(object):
    """Tests for snippet generation through `SnippetGenerator`."""

    def test_document_snippet(self, dir_index):
        """A title snippet for the query "sea whale" highlights only "Sea"."""
        index_dir, _ = dir_index
        doc_schema = schema()
        index = Index(doc_schema, str(index_dir))
        query = index.parse_query("sea whale", ["title", "body"])
        searcher = index.searcher()

        result = searcher.search(query)
        assert len(result.hits) == 1

        snippet_generator = SnippetGenerator.create(
            searcher, query, doc_schema, "title"
        )

        # The score is not needed here; only the document address is used.
        for _score, doc_address in result.hits:
            doc = searcher.doc(doc_address)
            snippet = snippet_generator.snippet_from_doc(doc)

            highlights = snippet.highlighted()
            assert len(highlights) == 1

            first = highlights[0]
            assert first.start == 20
            assert first.end == 23

            html_snippet = snippet.to_html()
            assert html_snippet == 'The Old Man and the <b>Sea</b>'