From 03b1c89fa3189865cab2df51a0c2a861e6f43187 Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Sat, 4 May 2024 05:10:58 +0900 Subject: [PATCH] Expose Tantivy's PhraseQuery (#234) --- src/query.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ tantivy/tantivy.pyi | 12 +++--------- tests/tantivy_test.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/query.rs b/src/query.rs index 1160f6a..e450779 100644 --- a/src/query.rs +++ b/src/query.rs @@ -157,6 +157,48 @@ impl Query { }) } + /// Construct a Tantivy's PhraseQuery with custom offsets and slop + /// + /// # Arguments + /// + /// * `schema` - Schema of the target index. + /// * `field_name` - Field name to be searched. + /// * `words` - Word list that constructs the phrase. A word can be a term text or a pair of term text and its offset in the phrase. + /// * `slop` - (Optional) The number of gaps permitted between the words in the query phrase. Default is 0. + #[staticmethod] + #[pyo3(signature = (schema, field_name, words, slop = 0))] + pub(crate) fn phrase_query( + schema: &Schema, + field_name: &str, + words: Vec<&PyAny>, + slop: u32, + ) -> PyResult { + let mut terms_with_offset = Vec::with_capacity(words.len()); + for (idx, word) in words.into_iter().enumerate() { + if let Ok((offset, value)) = word.extract() { + // Custom offset is provided. + let term = make_term(&schema.inner, field_name, value)?; + terms_with_offset.push((offset, term)); + } else { + // Custom offset is not provided. Use the list index as the offset. 
+ let term = make_term(&schema.inner, field_name, word)?; + terms_with_offset.push((idx, term)); + }; + } + if terms_with_offset.is_empty() { + return Err(exceptions::PyValueError::new_err( + "words must not be empty.", + )); + } + let inner = tv::query::PhraseQuery::new_with_offset_and_slop( + terms_with_offset, + slop, + ); + Ok(Query { + inner: Box::new(inner), + }) + } + /// Construct a Tantivy's BooleanQuery #[staticmethod] #[pyo3(signature = (subqueries))] diff --git a/tantivy/tantivy.pyi b/tantivy/tantivy.pyi index 9325ae7..8eaeff0 100644 --- a/tantivy/tantivy.pyi +++ b/tantivy/tantivy.pyi @@ -1,6 +1,6 @@ import datetime from enum import Enum -from typing import Any, Optional, Sequence +from typing import Any, Optional, Sequence, Union class Schema: pass @@ -206,16 +206,10 @@ class Query: pass @staticmethod - def fuzzy_term_query( - schema: Schema, - field_name: str, - text: str, - distance: int = 1, - transposition_cost_one: bool = True, - prefix=False, - ) -> Query: + def phrase_query(schema: Schema, field_name: str, words: list[Union[str, tuple[int, str]]], slop: int = 0) -> Query: pass + @staticmethod def boolean_query(subqueries: Sequence[tuple[Occur, Query]]) -> Query: pass diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index bbb3219..4883674 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -801,6 +801,38 @@ class TestQuery(object): result = index.searcher().search(query, 10) assert len(result.hits) == 3 + def test_phrase_query(self, ram_index): + index = ram_index + searcher = index.searcher() + + query = Query.phrase_query(index.schema, "title", ["old", "man"]) + # should match the title "The Old Man and the Sea" + result = searcher.search(query, 10) + assert len(result.hits) == 1 + + query = Query.phrase_query(index.schema, "title", ["man", "old"]) + # shouldn't match any document + result = searcher.search(query, 10) + assert len(result.hits) == 0 + + query = Query.phrase_query(index.schema, "title", [(1, "man"), (0, 
"old")]) + # should match "The Old Man and the Sea" with the given offsets + result = searcher.search(query, 10) + assert len(result.hits) == 1 + + query = Query.phrase_query(index.schema, "title", ["man", "sea"]) + # shouldn't match any document with default slop 0. + result = searcher.search(query, 10) + assert len(result.hits) == 0 + + query = Query.phrase_query(index.schema, "title", ["man", "sea"], slop=2) + # should match the title "The Old Man and the Sea" with slop 2. + result = searcher.search(query, 10) + assert len(result.hits) == 1 + + with pytest.raises(ValueError, match = "words must not be empty."): + Query.phrase_query(index.schema, "title", []) + def test_fuzzy_term_query(self, ram_index): index = ram_index query = Query.fuzzy_term_query(index.schema, "title", "ice")