From f164b0ef4abb6884e5d03fcb1abd10df8fb187ee Mon Sep 17 00:00:00 2001 From: Caleb Hattingh Date: Wed, 13 Sep 2023 12:03:39 +0200 Subject: [PATCH] Snippet generator (fixes #36 and #63) (#122) Co-authored-by: Justin Greene Co-authored-by: Paul Masurel Co-authored-by: Pratyush Mittal Co-authored-by: mukeshsahnis --- .gitignore | 6 ++-- README.md | 15 ++++++++ src/document.rs | 2 +- src/lib.rs | 6 ++++ src/snippet.rs | 81 +++++++++++++++++++++++++++++++++++++++++++ tests/tantivy_test.py | 28 +++++++++++++-- 6 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 src/snippet.rs diff --git a/.gitignore b/.gitignore index eba1fea..030c1f2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,6 @@ tantivy.so tantivy.dylib tantivy/tantivy.cpython*.so tantivy.egg-info/ - -# Exclude the mkdocs site directory -site/ +.venv +.envrc +site/ \ No newline at end of file diff --git a/README.md b/README.md index 94af155..e97984d 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,21 @@ only supports python3. # Development +For compiling Python module: + +```bash +# create virtual env +python -m venv .venv +source .venv/bin/activate + +# install maturin, the build tool for PyO3 +pip install maturin + +# compile and install python module in venv +maturin develop +``` + + Setting up a development environment can be done in a virtual environment using [`nox`](https://nox.thea.codes) or using local packages using the provided `Makefile`. diff --git a/src/document.rs b/src/document.rs index ac08598..3adb13d 100644 --- a/src/document.rs +++ b/src/document.rs @@ -797,7 +797,7 @@ impl Document { Ok(()) } - fn iter_values_for_field<'a>( + pub fn iter_values_for_field<'a>( &'a self, field: &str, ) -> impl Iterator + 'a { diff --git a/src/lib.rs b/src/lib.rs index 245cfee..70ea2fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,13 +8,16 @@ mod query; mod schema; mod schemabuilder; mod searcher; +mod snippet; use document::Document; use facet::Facet; use index::Index; +use query::Query; use schema::Schema; use schemabuilder::SchemaBuilder; use searcher::{DocAddress, SearchResult, Searcher}; +use snippet::{Snippet, SnippetGenerator}; /// Python bindings for the search engine library Tantivy. /// @@ -76,6 +79,9 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/snippet.rs b/src/snippet.rs new file mode 100644 index 0000000..e74b3ce --- /dev/null +++ b/src/snippet.rs @@ -0,0 +1,81 @@ +use crate::to_pyerr; +use pyo3::prelude::*; +use tantivy as tv; + +/// Tantivy schema. +/// +/// The schema is very strict. To build the schema the `SchemaBuilder` class is +/// provided. +#[pyclass] +pub(crate) struct Snippet { + pub(crate) inner: tv::Snippet, +} + +#[pyclass] +pub(crate) struct Range { + #[pyo3(get)] + start: usize, + #[pyo3(get)] + end: usize, +} + +#[pymethods] +impl Snippet { + pub fn to_html(&self) -> PyResult { + Ok(self.inner.to_html()) + } + + pub fn highlighted(&self) -> Vec { + let highlighted = self.inner.highlighted(); + let results = highlighted + .iter() + .map(|r| Range { + start: r.start, + end: r.end, + }) + .collect::>(); + results + } +} + +#[pyclass] +pub(crate) struct SnippetGenerator { + pub(crate) field_name: String, + pub(crate) inner: tv::SnippetGenerator, +} + +#[pymethods] +impl SnippetGenerator { + #[staticmethod] + pub fn create( + searcher: &crate::Searcher, + query: &crate::Query, + schema: &crate::Schema, + field_name: &str, + ) -> PyResult { + let field = schema + .inner + .get_field(field_name) + .or(Err("field not found")) + .map_err(to_pyerr)?; + let generator = + tv::SnippetGenerator::create(&searcher.inner, query.get(), field) + .map_err(to_pyerr)?; + + return Ok(SnippetGenerator { + field_name: field_name.to_string(), + inner: generator, + }); + } + + pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet { + let text: String = doc + .iter_values_for_field(&self.field_name) + .flat_map(tv::schema::Value::as_text) + .collect::>() + .join(" "); + + let result = self.inner.snippet(&text); + Snippet { inner: result } + } +} diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index a15fa30..6b1d7fd 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -5,8 +5,8 @@ import datetime import tantivy import pickle import pytest - -from tantivy import Document, Index, SchemaBuilder +import tantivy +from tantivy import Document, Index, SchemaBuilder, SnippetGenerator def schema(): @@ -784,3 +784,27 @@ def test_doc_address_pickle(): pickled = pickle.loads(pickle.dumps(orig)) assert orig == pickled + + +class TestSnippets(object): + def test_document_snippet(self, dir_index): + index_dir, _ = dir_index + doc_schema = schema() + index = Index(doc_schema, str(index_dir)) + query = index.parse_query("sea whale", ["title", "body"]) + searcher = index.searcher() + result = searcher.search(query) + assert len(result.hits) == 1 + + snippet_generator = SnippetGenerator.create(searcher, query, doc_schema, "title") + + for (score, doc_address) in result.hits: + doc = searcher.doc(doc_address) + snippet = snippet_generator.snippet_from_doc(doc) + highlights = snippet.highlighted() + assert len(highlights) == 1 + first = highlights[0] + assert first.start == 20 + assert first.end == 23 + html_snippet = snippet.to_html() + assert html_snippet == 'The Old Man and the Sea'