Snippet generator (fixes #36 and #63) (#122)

Co-authored-by: Justin Greene <justin.greene@intouchsol.com>
Co-authored-by: Paul Masurel <paul@quickwit.io>
Co-authored-by: Pratyush Mittal <pratyushmittal@gmail.com>
Co-authored-by: mukeshsahnis <er.mks89@gmail.com>
master
Caleb Hattingh 2023-09-13 12:03:39 +02:00 committed by GitHub
parent 72e221033c
commit f164b0ef4a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 132 additions and 6 deletions

4
.gitignore vendored
View File

@ -9,6 +9,6 @@ tantivy.so
tantivy.dylib tantivy.dylib
tantivy/tantivy.cpython*.so tantivy/tantivy.cpython*.so
tantivy.egg-info/ tantivy.egg-info/
.venv
# Exclude the mkdocs site directory .envrc
site/ site/

View File

@ -23,6 +23,21 @@ only supports python3.
# Development # Development
To compile the Python module:
```bash
# create virtual env
python -m venv .venv
source .venv/bin/activate
# install maturin, the build tool for PyO3
pip install maturin
# compile and install python module in venv
maturin develop
```
Setting up a development environment can be done in a virtual environment using Setting up a development environment can be done in a virtual environment using
[`nox`](https://nox.thea.codes) or using local packages using the provided `Makefile`. [`nox`](https://nox.thea.codes) or using local packages using the provided `Makefile`.

View File

@ -797,7 +797,7 @@ impl Document {
Ok(()) Ok(())
} }
fn iter_values_for_field<'a>( pub fn iter_values_for_field<'a>(
&'a self, &'a self,
field: &str, field: &str,
) -> impl Iterator<Item = &'a Value> + 'a { ) -> impl Iterator<Item = &'a Value> + 'a {

View File

@ -8,13 +8,16 @@ mod query;
mod schema; mod schema;
mod schemabuilder; mod schemabuilder;
mod searcher; mod searcher;
mod snippet;
use document::Document; use document::Document;
use facet::Facet; use facet::Facet;
use index::Index; use index::Index;
use query::Query;
use schema::Schema; use schema::Schema;
use schemabuilder::SchemaBuilder; use schemabuilder::SchemaBuilder;
use searcher::{DocAddress, SearchResult, Searcher}; use searcher::{DocAddress, SearchResult, Searcher};
use snippet::{Snippet, SnippetGenerator};
/// Python bindings for the search engine library Tantivy. /// Python bindings for the search engine library Tantivy.
/// ///
@ -76,6 +79,9 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Index>()?; m.add_class::<Index>()?;
m.add_class::<DocAddress>()?; m.add_class::<DocAddress>()?;
m.add_class::<Facet>()?; m.add_class::<Facet>()?;
m.add_class::<Query>()?;
m.add_class::<Snippet>()?;
m.add_class::<SnippetGenerator>()?;
Ok(()) Ok(())
} }

81
src/snippet.rs Normal file
View File

@ -0,0 +1,81 @@
use crate::to_pyerr;
use pyo3::prelude::*;
use tantivy as tv;
/// A highlighted excerpt of a document field, exposed to Python.
///
/// Thin wrapper around `tantivy::Snippet`. Instances are produced by
/// `SnippetGenerator::snippet_from_doc`.
#[pyclass]
pub(crate) struct Snippet {
// The wrapped tantivy snippet that every method delegates to.
pub(crate) inner: tv::Snippet,
}
/// One highlighted region of a snippet, given as a `start`/`end` offset pair.
///
/// Both values are copied unchanged from the ranges reported by the wrapped
/// tantivy snippet. In the test suite the word "Sea" at the end of
/// "The Old Man and the Sea" is reported as `start == 20`, `end == 23`, so
/// `end` is exclusive.
///
/// NOTE(review): presumably these are byte offsets into the snippet fragment
/// rather than character offsets; confirm against the tantivy documentation.
#[pyclass]
pub(crate) struct Range {
// Offset where the highlighted region begins; read-only from Python.
#[pyo3(get)]
start: usize,
// Offset just past the highlighted region; read-only from Python.
#[pyo3(get)]
end: usize,
}
#[pymethods]
impl Snippet {
    /// Renders the snippet as an HTML string using the wrapped tantivy snippet.
    ///
    /// The result is always `Ok`; the `PyResult` wrapper is kept so the
    /// Python-facing signature stays unchanged.
    pub fn to_html(&self) -> PyResult<String> {
        let html = self.inner.to_html();
        Ok(html)
    }

    /// Returns every highlighted region of the snippet as a `Range`.
    ///
    /// The regions are returned in the order reported by the wrapped tantivy
    /// snippet, with `start` and `end` copied unchanged.
    pub fn highlighted(&self) -> Vec<Range> {
        self.inner
            .highlighted()
            .iter()
            .map(|span| Range {
                start: span.start,
                end: span.end,
            })
            .collect()
    }
}
/// Produces `Snippet`s for a single schema field, exposed to Python.
///
/// Wraps `tantivy::SnippetGenerator` and remembers the name of the field it
/// was created for, so that `snippet_from_doc` can pull that field's text out
/// of a `Document`.
#[pyclass]
pub(crate) struct SnippetGenerator {
// Name of the schema field whose text is turned into snippets.
pub(crate) field_name: String,
// The underlying tantivy generator built for that field.
pub(crate) inner: tv::SnippetGenerator,
}
#[pymethods]
impl SnippetGenerator {
#[staticmethod]
pub fn create(
searcher: &crate::Searcher,
query: &crate::Query,
schema: &crate::Schema,
field_name: &str,
) -> PyResult<SnippetGenerator> {
let field = schema
.inner
.get_field(field_name)
.or(Err("field not found"))
.map_err(to_pyerr)?;
let generator =
tv::SnippetGenerator::create(&searcher.inner, query.get(), field)
.map_err(to_pyerr)?;
return Ok(SnippetGenerator {
field_name: field_name.to_string(),
inner: generator,
});
}
pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet {
let text: String = doc
.iter_values_for_field(&self.field_name)
.flat_map(tv::schema::Value::as_text)
.collect::<Vec<&str>>()
.join(" ");
let result = self.inner.snippet(&text);
Snippet { inner: result }
}
}

View File

@ -5,8 +5,8 @@ import datetime
import tantivy import tantivy
import pickle import pickle
import pytest import pytest
import tantivy
from tantivy import Document, Index, SchemaBuilder from tantivy import Document, Index, SchemaBuilder, SnippetGenerator
def schema(): def schema():
@ -784,3 +784,27 @@ def test_doc_address_pickle():
pickled = pickle.loads(pickle.dumps(orig)) pickled = pickle.loads(pickle.dumps(orig))
assert orig == pickled assert orig == pickled
class TestSnippets:
    """Tests for snippet generation through ``SnippetGenerator``."""

    def test_document_snippet(self, dir_index):
        """A "sea whale" query highlights "Sea" in the single matching title."""
        index_dir, _ = dir_index
        doc_schema = schema()
        index = Index(doc_schema, str(index_dir))

        query = index.parse_query("sea whale", ["title", "body"])
        searcher = index.searcher()
        result = searcher.search(query)
        assert len(result.hits) == 1

        snippet_generator = SnippetGenerator.create(
            searcher, query, doc_schema, "title"
        )

        # The score is not needed here; only the address is used to load the doc.
        for _score, doc_address in result.hits:
            doc = searcher.doc(doc_address)
            snippet = snippet_generator.snippet_from_doc(doc)

            highlights = snippet.highlighted()
            assert len(highlights) == 1

            first = highlights[0]
            assert first.start == 20
            assert first.end == 23

            html_snippet = snippet.to_html()
            assert html_snippet == 'The Old Man and the <b>Sea</b>'