Expose Tantivy's MoreLikeThisQuery (#257)
parent
03b1c89fa3
commit
9fafdf25cb
50
src/query.rs
50
src/query.rs
|
@ -1,4 +1,4 @@
|
|||
use crate::{get_field, make_term, to_pyerr, Schema};
|
||||
use crate::{get_field, make_term, to_pyerr, DocAddress, Schema};
|
||||
use pyo3::{
|
||||
exceptions,
|
||||
prelude::*,
|
||||
|
@ -100,7 +100,7 @@ impl Query {
|
|||
let terms = field_values
|
||||
.into_iter()
|
||||
.map(|field_value| {
|
||||
make_term(&schema.inner, field_name, &field_value)
|
||||
make_term(&schema.inner, field_name, field_value)
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
let inner = tv::query::TermSetQuery::new(terms);
|
||||
|
@ -138,7 +138,7 @@ impl Query {
|
|||
transposition_cost_one: bool,
|
||||
prefix: bool,
|
||||
) -> PyResult<Query> {
|
||||
let term = make_term(&schema.inner, field_name, &text)?;
|
||||
let term = make_term(&schema.inner, field_name, text)?;
|
||||
let inner = if prefix {
|
||||
tv::query::FuzzyTermQuery::new_prefix(
|
||||
term,
|
||||
|
@ -272,6 +272,50 @@ impl Query {
|
|||
}
|
||||
}
|
||||
|
||||
#[staticmethod]
|
||||
#[pyo3(signature = (doc_address, min_doc_frequency = Some(5), max_doc_frequency = None, min_term_frequency = Some(2), max_query_terms = Some(25), min_word_length = None, max_word_length = None, boost_factor = Some(1.0), stop_words = vec![]))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn more_like_this_query(
|
||||
doc_address: &DocAddress,
|
||||
min_doc_frequency: Option<u64>,
|
||||
max_doc_frequency: Option<u64>,
|
||||
min_term_frequency: Option<usize>,
|
||||
max_query_terms: Option<usize>,
|
||||
min_word_length: Option<usize>,
|
||||
max_word_length: Option<usize>,
|
||||
boost_factor: Option<f32>,
|
||||
stop_words: Vec<String>,
|
||||
) -> PyResult<Query> {
|
||||
let mut builder = tv::query::MoreLikeThisQuery::builder();
|
||||
if let Some(value) = min_doc_frequency {
|
||||
builder = builder.with_min_doc_frequency(value);
|
||||
}
|
||||
if let Some(value) = max_doc_frequency {
|
||||
builder = builder.with_max_doc_frequency(value);
|
||||
}
|
||||
if let Some(value) = min_term_frequency {
|
||||
builder = builder.with_min_term_frequency(value);
|
||||
}
|
||||
if let Some(value) = max_query_terms {
|
||||
builder = builder.with_max_query_terms(value);
|
||||
}
|
||||
if let Some(value) = min_word_length {
|
||||
builder = builder.with_min_word_length(value);
|
||||
}
|
||||
if let Some(value) = max_word_length {
|
||||
builder = builder.with_max_word_length(value);
|
||||
}
|
||||
if let Some(value) = boost_factor {
|
||||
builder = builder.with_boost_factor(value);
|
||||
}
|
||||
builder = builder.with_stop_words(stop_words);
|
||||
|
||||
let inner = builder.with_document(tv::DocAddress::from(doc_address));
|
||||
Ok(Query {
|
||||
inner: Box::new(inner),
|
||||
})
|
||||
}
|
||||
|
||||
/// Construct a Tantivy's ConstScoreQuery
|
||||
#[staticmethod]
|
||||
#[pyo3(signature = (query, score))]
|
||||
|
|
|
@ -153,6 +153,7 @@ impl Searcher {
|
|||
///
|
||||
/// Raises a ValueError if there was an error with the search.
|
||||
#[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0, order = Order::Desc))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn search(
|
||||
&self,
|
||||
py: Python,
|
||||
|
|
|
@ -227,6 +227,20 @@ class Query:
|
|||
def regex_query(schema: Schema, field_name: str, regex_pattern: str) -> Query:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def more_like_this_query(
|
||||
doc_address: DocAddress,
|
||||
min_doc_frequency: Optional[int] = 5,
|
||||
max_doc_frequency: Optional[int] = None,
|
||||
min_term_frequency: Optional[int] = 2,
|
||||
max_query_terms: Optional[int] = 25,
|
||||
min_word_length: Optional[int] = None,
|
||||
max_word_length: Optional[int] = None,
|
||||
boost_factor: Optional[float] = 1.0,
|
||||
stop_words: list[str] = []
|
||||
) -> Query:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def const_score_query(query: Query, score: float) -> Query:
|
||||
pass
|
||||
|
|
|
@ -1090,6 +1090,42 @@ class TestQuery(object):
|
|||
):
|
||||
Query.regex_query(index.schema, "body", "fish(")
|
||||
|
||||
def test_more_like_this_query(self, ram_index):
|
||||
index = ram_index
|
||||
|
||||
# first, search the target doc
|
||||
query = Query.term_query(index.schema, "title", "man")
|
||||
result = index.searcher().search(query, 1)
|
||||
_, doc_address = result.hits[0]
|
||||
searched_doc = index.searcher().doc(doc_address)
|
||||
assert searched_doc["title"] == ["The Old Man and the Sea"]
|
||||
|
||||
# construct the default MLT Query
|
||||
mlt_query = Query.more_like_this_query(doc_address)
|
||||
assert (
|
||||
repr(mlt_query)
|
||||
== "Query(MoreLikeThisQuery { mlt: MoreLikeThis { min_doc_frequency: Some(5), max_doc_frequency: None, min_term_frequency: Some(2), max_query_terms: Some(25), min_word_length: None, max_word_length: None, boost_factor: Some(1.0), stop_words: [] }, target: DocumentAdress(DocAddress { segment_ord: 0, doc_id: 0 }) })"
|
||||
)
|
||||
result = index.searcher().search(mlt_query, 10)
|
||||
assert len(result.hits) == 0
|
||||
|
||||
# construct a fine-tuned MLT Query
|
||||
mlt_query = Query.more_like_this_query(
|
||||
doc_address,
|
||||
min_doc_frequency=2,
|
||||
max_doc_frequency=10,
|
||||
min_term_frequency=1,
|
||||
max_query_terms=10,
|
||||
min_word_length=2,
|
||||
max_word_length=20,
|
||||
boost_factor=2.0,
|
||||
stop_words=["fish"])
|
||||
assert (
|
||||
repr(mlt_query)
|
||||
== "Query(MoreLikeThisQuery { mlt: MoreLikeThis { min_doc_frequency: Some(2), max_doc_frequency: Some(10), min_term_frequency: Some(1), max_query_terms: Some(10), min_word_length: Some(2), max_word_length: Some(20), boost_factor: Some(2.0), stop_words: [\"fish\"] }, target: DocumentAdress(DocAddress { segment_ord: 0, doc_id: 0 }) })"
|
||||
)
|
||||
result = index.searcher().search(mlt_query, 10)
|
||||
assert len(result.hits) > 0
|
||||
def test_const_score_query(self, ram_index):
|
||||
index = ram_index
|
||||
|
||||
|
|
Loading…
Reference in New Issue