Tantivy 0.19.2 (#67)

* Adding __init__.py file to the tantivy folder to make maturin happy

* Add Cargo.lock to the repo

* Set the git-fetch-with-cli cargo flag so that we can override fetch settings

* Renaming .cargo/config to .cargo/config.toml

* Adding github-quiq-sh cargo registry

* Point dependencies at our github-quiq-sh registry

* Trying to resolve this build issue, pointing pyo3-build-config at our github-quiq-sh registry

* SER-21487: Enable support for all standard Tantivy languages plus Chinese + Japanese in tantivy-py

* SER-21487: Use uname rather than UNAME in the Makefile

* SER-21487: Fix document date handling

* SER-23013: Upgrade Tantivy and other dependencies

* Upgrade to Tantivy 0.19.1

* Apply rustfmt and fix bug when fast option = None

* Upgrade to tantivy-0.19.2

* Standardize around using 'cargo fmt' rather than 'rustfmt'

* Reverting to old style dependencies

* Linting with clippy

* Switching from a hashmap to an array for defining tokenizers, and adding a test for Spanish indexing

* Use cargo fmt instead of rustfmt on the Lint ci step
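
Taken together, the headline change for users is the set of per-language stemming analyzers registered on every index. A minimal sketch, assuming a wheel built from this branch, of what that looks like from Python; the "fr_stem" name comes from the analyzer table added in the Rust changes below, and the field names and text are illustrative:

    import tantivy

    # Text fields that use the French analyzer registered by
    # Index::register_custom_text_analyzers (SimpleTokenizer,
    # RemoveLongFilter(40), LowerCaser, French stemmer).
    schema = (
        tantivy.SchemaBuilder()
        .add_text_field("title", stored=True, tokenizer_name="fr_stem")
        .add_text_field("body", tokenizer_name="fr_stem")
        .build()
    )
    index = tantivy.Index(schema)

    writer = index.writer()
    doc = tantivy.Document()
    doc.add_text("title", "Le Vieil Homme et la Mer")
    doc.add_text("body", "Un vieil homme pêchait seul dans le Gulf Stream.")
    writer.add_document(doc)
    writer.commit()
    index.reload()

    # The query parser applies the field's analyzer, so inflected forms
    # should match by stem.
    query = index.parse_query("vieil", ["title", "body"])
    print(index.searcher().search(query, 10).hits)

The Spanish test added at the bottom of this diff exercises the same mechanism with "es_stem" and asserts that the query term "vieja" matches the indexed "viejo".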
Branch: master
Phill Mell-Davies authored 2023-02-14 08:20:59 -05:00, committed by GitHub
Parent: b2043793ee
Commit: 164adc87e1
15 changed files with 1820 additions and 106 deletions

.cargo/config.toml

@@ -2,4 +2,4 @@
 rustflags = [
     "-C", "link-arg=-undefined",
     "-C", "link-arg=dynamic_lookup",
 ]

(GitHub Actions workflow)

@@ -18,7 +18,7 @@ jobs:
           toolchain: stable
           components: rustfmt
       - name: Check Formatting
-        run: rustfmt --check src/*rs
+        run: cargo fmt --check
   Test:
     strategy:

.gitignore (vendored, 2 changed lines)

@@ -3,9 +3,9 @@
 build
 /target
 **/*.rs.bk
-Cargo.lock
 dist/
 __pycache__/
 tantivy.so
+tantivy.dylib
 tantivy/tantivy.cpython*.so
 tantivy.egg-info/

Cargo.lock (generated, new file, 1573 lines)

File diff suppressed because it is too large.

Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.17.0"
+version = "0.19.2"
 readme = "README.md"
 authors = ["Damir Jelić <poljar@termina.org.uk>"]
 edition = "2018"
@@ -11,15 +11,15 @@ name = "tantivy"
 crate-type = ["cdylib"]

 [build-dependencies]
-pyo3-build-config = "0.16.3"
+pyo3-build-config = "0.18.0"

 [dependencies]
-chrono = "0.4.19"
-tantivy = "0.17"
-itertools = "0.10.3"
-futures = "0.3.21"
-serde_json = "1.0.64"
+chrono = "0.4.23"
+tantivy = "0.19.2"
+itertools = "0.10.5"
+futures = "0.3.26"
+serde_json = "1.0.91"

 [dependencies.pyo3]
-version = "0.16.3"
+version = "0.18.0"
 features = ["extension-module"]

Makefile

@@ -1,4 +1,4 @@
-ifeq ($(shell UNAME),Darwin)
+ifeq ($(shell uname),Darwin)
 EXT := dylib
 else
 EXT := so
@@ -6,15 +6,21 @@ endif
 source_files := $(wildcard src/*.rs)

-all: tantivy/tantivy.$(EXT)
+all: format lint build test

 PHONY: test format

+lint:
+	cargo clippy
+
 test: tantivy/tantivy.$(EXT)
 	python3 -m pytest

 format:
-	rustfmt src/*.rs
+	cargo fmt
+
+build:
+	maturin build --interpreter python3.7 python3.8 python3.9 python3.10 python3.11

 tantivy/tantivy.$(EXT): target/debug/libtantivy.$(EXT)
 	cp target/debug/libtantivy.$(EXT) tantivy/tantivy.so

pyproject.toml

@@ -1,5 +1,5 @@
 [build-system]
-requires = ["maturin"]
+requires = ["maturin>=0.13,<0.14"]
 build-backend = "maturin"

 [project]

(rustfmt configuration)

@@ -1 +1 @@
-max_width = 80
+max_width = 80

src/document.rs

@@ -9,7 +9,7 @@ use pyo3::{
     },
 };

-use chrono::{offset::TimeZone, Datelike, Timelike, Utc};
+use chrono::{offset::TimeZone, Utc};
 use tantivy as tv;

@@ -56,37 +56,42 @@ fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
             // TODO implement me
             unimplemented!();
         }
-        Value::Date(d) => PyDateTime::new(
-            py,
-            d.year(),
-            d.month() as u8,
-            d.day() as u8,
-            d.hour() as u8,
-            d.minute() as u8,
-            d.second() as u8,
-            d.timestamp_subsec_micros(),
-            None,
-        )?
-        .into_py(py),
+        Value::Date(d) => {
+            let utc = d.into_utc();
+            PyDateTime::new(
+                py,
+                utc.year(),
+                utc.month() as u8,
+                utc.day(),
+                utc.hour(),
+                utc.minute(),
+                utc.second(),
+                utc.microsecond(),
+                None,
+            )?
+            .into_py(py)
+        }
         Value::Facet(f) => Facet { inner: f.clone() }.into_py(py),
         Value::JsonObject(json_object) => {
             let inner: HashMap<_, _> = json_object
                 .iter()
-                .map(|(k, v)| (k, value_to_object(&v, py)))
+                .map(|(k, v)| (k, value_to_object(v, py)))
                 .collect();
             inner.to_object(py)
         }
+        Value::Bool(b) => b.into_py(py),
+        Value::IpAddr(i) => (*i).to_string().into_py(py),
     })
 }

 fn value_to_string(value: &Value) -> String {
     match value {
         Value::Str(text) => text.clone(),
-        Value::U64(num) => format!("{}", num),
-        Value::I64(num) => format!("{}", num),
-        Value::F64(num) => format!("{}", num),
-        Value::Bytes(bytes) => format!("{:?}", bytes),
-        Value::Date(d) => format!("{:?}", d),
+        Value::U64(num) => format!("{num}"),
+        Value::I64(num) => format!("{num}"),
+        Value::F64(num) => format!("{num}"),
+        Value::Bytes(bytes) => format!("{bytes:?}"),
+        Value::Date(d) => format!("{d:?}"),
         Value::Facet(facet) => facet.to_string(),
         Value::PreTokStr(_pretok) => {
             // TODO implement me
@@ -95,6 +100,8 @@ fn value_to_string(value: &Value) -> String {
         Value::JsonObject(json_object) => {
             serde_json::to_string(&json_object).unwrap()
         }
+        Value::Bool(b) => format!("{b}"),
+        Value::IpAddr(i) => format!("{}", *i),
     }
 }

@@ -141,10 +148,10 @@ impl fmt::Debug for Document {
                     .chars()
                     .take(10)
                     .collect();
-                format!("{}=[{}]", field_name, values_str)
+                format!("{field_name}=[{values_str}]")
             })
             .join(",");

-        write!(f, "Document({})", doc_str)
+        write!(f, "Document({doc_str})")
     }
 }

@@ -170,23 +177,24 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
     }
     if let Ok(py_datetime) = any.downcast::<PyDateTime>() {
         let datetime = Utc
-            .ymd(
+            .with_ymd_and_hms(
                 py_datetime.get_year(),
                 py_datetime.get_month().into(),
                 py_datetime.get_day().into(),
-            )
-            .and_hms_micro(
                 py_datetime.get_hour().into(),
                 py_datetime.get_minute().into(),
                 py_datetime.get_second().into(),
-                py_datetime.get_microsecond(),
-            );
-        return Ok(Value::Date(datetime));
+            )
+            .single()
+            .unwrap();
+        return Ok(Value::Date(tv::DateTime::from_timestamp_secs(
+            datetime.timestamp(),
+        )));
     }
     if let Ok(facet) = any.extract::<Facet>() {
-        return Ok(Value::Facet(facet.inner.clone()));
+        return Ok(Value::Facet(facet.inner));
     }
-    Err(to_pyerr(format!("Value unsupported {:?}", any)))
+    Err(to_pyerr(format!("Value unsupported {any:?}")))
 }

 fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
@@ -200,7 +208,7 @@ fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
 #[pymethods]
 impl Document {
     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (**kwargs))]
     fn new(kwargs: Option<&PyDict>) -> PyResult<Self> {
         let mut document = Document::default();
         if let Some(field_dict) = kwargs {
@@ -299,18 +307,21 @@ impl Document {
     ///     value (datetime): The date that will be added to the document.
     fn add_date(&mut self, field_name: String, value: &PyDateTime) {
         let datetime = Utc
-            .ymd(
+            .with_ymd_and_hms(
                 value.get_year(),
                 value.get_month().into(),
                 value.get_day().into(),
-            )
-            .and_hms_micro(
                 value.get_hour().into(),
                 value.get_minute().into(),
                 value.get_second().into(),
-                value.get_microsecond(),
-            );
-        add_value(self, field_name, datetime);
+            )
+            .single()
+            .unwrap();
+        add_value(
+            self,
+            field_name,
+            tv::DateTime::from_timestamp_secs(datetime.timestamp()),
+        );
     }

     /// Add a facet value to the document.
@@ -387,13 +398,13 @@ impl Document {
     }

     fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
-        let gil = Python::acquire_gil();
-        let py = gil.python();
-        self.get_all(py, field_name)
+        Python::with_gil(|py| -> PyResult<Vec<PyObject>> {
+            self.get_all(py, field_name)
+        })
     }

     fn __repr__(&self) -> PyResult<String> {
-        Ok(format!("{:?}", self))
+        Ok(format!("{self:?}"))
     }
 }
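
One consequence of the date-handling fix worth calling out: add_date now routes through Utc.with_ymd_and_hms(...) and tv::DateTime::from_timestamp_secs(...), so sub-second precision is discarded on the way in. A small sketch of the expected behaviour under this change (field name and values are illustrative):

    import datetime
    import tantivy

    doc = tantivy.Document()
    doc.add_date("when", datetime.datetime(2023, 2, 14, 8, 20, 59, 123456))

    # The value is stored as whole seconds; reading it back converts through
    # into_utc() and should yield a naive datetime with microsecond == 0.
    print(doc["when"])  # expected: [datetime.datetime(2023, 2, 14, 8, 20, 59)]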

src/index.rs

@@ -14,6 +14,10 @@ use tantivy as tv;
 use tantivy::{
     directory::MmapDirectory,
     schema::{NamedFieldDocument, Term, Value},
+    tokenizer::{
+        Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
+        TextAnalyzer,
+    },
 };

 const RELOAD_POLICY: &str = "commit";
@@ -120,26 +124,25 @@ impl IndexWriter {
             Value::U64(num) => Term::from_field_u64(field, num),
             Value::I64(num) => Term::from_field_i64(field, num),
             Value::F64(num) => Term::from_field_f64(field, num),
-            Value::Date(d) => Term::from_field_date(field, &d),
+            Value::Date(d) => Term::from_field_date(field, d),
             Value::Facet(facet) => Term::from_facet(field, &facet),
             Value::Bytes(_) => {
                 return Err(exceptions::PyValueError::new_err(format!(
-                    "Field `{}` is bytes type not deletable.",
-                    field_name
+                    "Field `{field_name}` is bytes type not deletable."
                 )))
             }
             Value::PreTokStr(_pretok) => {
                 return Err(exceptions::PyValueError::new_err(format!(
-                    "Field `{}` is pretokenized. This is not authorized for delete.",
-                    field_name
+                    "Field `{field_name}` is pretokenized. This is not authorized for delete."
                 )))
             }
             Value::JsonObject(_) => {
                 return Err(exceptions::PyValueError::new_err(format!(
-                    "Field `{}` is json object type not deletable.",
-                    field_name
+                    "Field `{field_name}` is json object type not deletable."
                 )))
-            }
+            },
+            Value::Bool(b) => Term::from_field_bool(field, b),
+            Value::IpAddr(i) => Term::from_field_ip_addr(field, i)
         };
         Ok(self.inner_index_writer.delete_term(term))
     }
@@ -167,12 +170,15 @@ impl Index {
     #[staticmethod]
     fn open(path: &str) -> PyResult<Index> {
         let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
+
+        Index::register_custom_text_analyzers(&index);
+
         let reader = index.reader().map_err(to_pyerr)?;
         Ok(Index { index, reader })
     }

     #[new]
-    #[args(reuse = true)]
+    #[pyo3(signature = (schema, path = None, reuse = true))]
     fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Self> {
         let index = match path {
             Some(p) => {
@@ -191,6 +197,8 @@ impl Index {
             None => tv::Index::create_in_ram(schema.inner.clone()),
         };

+        Index::register_custom_text_analyzers(&index);
+
         let reader = index.reader().map_err(to_pyerr)?;
         Ok(Index { index, reader })
     }
@@ -208,7 +216,7 @@ impl Index {
     ///     automatically the number of threads.
     ///
     /// Raises ValueError if there was an error while creating the writer.
-    #[args(heap_size = 3000000, num_threads = 0)]
+    #[pyo3(signature = (heap_size = 3000000, num_threads = 0))]
     fn writer(
         &self,
         heap_size: usize,
@@ -231,13 +239,13 @@ impl Index {
     /// Args:
     ///     reload_policy (str, optional): The reload policy that the
     ///         IndexReader should use. Can be `Manual` or `OnCommit`.
-    ///     num_searchers (int, optional): The number of searchers that the
+    ///     num_warmers (int, optional): The number of searchers that the
     ///         reader should create.
-    #[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
+    #[pyo3(signature = (reload_policy = RELOAD_POLICY, num_warmers = 0))]
     fn config_reader(
         &mut self,
         reload_policy: &str,
-        num_searchers: usize,
+        num_warmers: usize,
     ) -> Result<(), PyErr> {
         let reload_policy = reload_policy.to_lowercase();
         let reload_policy = match reload_policy.as_ref() {
@@ -251,8 +259,8 @@ impl Index {
         };
         let builder = self.index.reader_builder();
         let builder = builder.reload_policy(reload_policy);
-        let builder = if num_searchers > 0 {
-            builder.num_searchers(num_searchers)
+        let builder = if num_warmers > 0 {
+            builder.num_warming_threads(num_warmers)
         } else {
             builder
         };
@@ -313,7 +321,7 @@ impl Index {
     ///     default_fields_names (List[Field]): A list of fields used to search if no
     ///         field is specified in the query.
     ///
-    #[args(reload_policy = "RELOAD_POLICY")]
+    #[pyo3(signature = (query, default_field_names = None))]
     pub fn parse_query(
         &self,
         query: &str,
@@ -328,16 +336,14 @@ impl Index {
             if !field_entry.is_indexed() {
                 return Err(exceptions::PyValueError::new_err(
                     format!(
-                        "Field `{}` is not set as indexed in the schema.",
-                        default_field_name
+                        "Field `{default_field_name}` is not set as indexed in the schema."
                     ),
                 ));
             }
             default_fields.push(field);
         } else {
             return Err(exceptions::PyValueError::new_err(format!(
-                "Field `{}` is not defined in the schema.",
-                default_field_name
+                "Field `{default_field_name}` is not defined in the schema."
             )));
         }
     }
@@ -355,3 +361,35 @@ impl Index {
         Ok(Query { inner: query })
     }
 }
+
+impl Index {
+    fn register_custom_text_analyzers(index: &tv::Index) {
+        let analyzers = [
+            ("ar_stem", Language::Arabic),
+            ("da_stem", Language::Danish),
+            ("nl_stem", Language::Dutch),
+            ("fi_stem", Language::Finnish),
+            ("fr_stem", Language::French),
+            ("de_stem", Language::German),
+            ("el_stem", Language::Greek),
+            ("hu_stem", Language::Hungarian),
+            ("it_stem", Language::Italian),
+            ("no_stem", Language::Norwegian),
+            ("pt_stem", Language::Portuguese),
+            ("ro_stem", Language::Romanian),
+            ("ru_stem", Language::Russian),
+            ("es_stem", Language::Spanish),
+            ("sv_stem", Language::Swedish),
+            ("ta_stem", Language::Tamil),
+            ("tr_stem", Language::Turkish),
+        ];
+
+        for (name, lang) in &analyzers {
+            let an = TextAnalyzer::from(SimpleTokenizer)
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser)
+                .filter(Stemmer::new(*lang));
+            index.tokenizers().register(name, an);
+        }
+    }
+}
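
Because the #[args(...)] attributes above became #[pyo3(signature = ...)], the Python-side keyword names follow the Rust parameter names; in particular num_searchers is gone and num_warmers now feeds ReaderBuilder::num_warming_threads. A hedged sketch of the writer/reader configuration calls (values are illustrative):

    import tantivy

    schema = tantivy.SchemaBuilder().add_text_field("body", stored=True).build()
    index = tantivy.Index(schema)  # path=None, reuse=True are the declared defaults

    # heap_size/num_threads defaults come from the #[pyo3(signature = ...)] attribute.
    writer = index.writer(heap_size=15_000_000, num_threads=1)

    # The renamed option controls warming threads on the reader builder.
    index.config_reader(reload_policy="commit", num_warmers=2)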

src/lib.rs

@@ -1,5 +1,5 @@
+use ::tantivy as tv;
 use pyo3::{exceptions, prelude::*};
-use tantivy as tv;

 mod document;
 mod facet;
@@ -88,8 +88,7 @@ pub(crate) fn get_field(
 ) -> PyResult<tv::schema::Field> {
     let field = schema.get_field(field_name).ok_or_else(|| {
         exceptions::PyValueError::new_err(format!(
-            "Field `{}` is not defined in the schema.",
-            field_name
+            "Field `{field_name}` is not defined in the schema."
         ))
     })?;

src/schemabuilder.rs

@@ -6,7 +6,7 @@ use tantivy::schema;
 use crate::schema::Schema;
 use std::sync::{Arc, RwLock};

-use tantivy::schema::INDEXED;
+use tantivy::schema::{DateOptions, INDEXED};

 /// Tantivy has a very strict schema.
 /// You need to specify in advance whether a field is indexed or not,
@@ -60,11 +60,12 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(
+    #[pyo3(signature = (
+        name,
         stored = false,
-        tokenizer_name = "TOKENIZER",
-        index_option = "RECORD"
-    )]
+        tokenizer_name = TOKENIZER,
+        index_option = RECORD
+    ))]
     fn add_text_field(
         &mut self,
         name: &str,
@@ -109,7 +110,7 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(stored = false, indexed = false)]
+    #[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
     fn add_integer_field(
         &mut self,
         name: &str,
@@ -151,7 +152,7 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(stored = false, indexed = false)]
+    #[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
     fn add_unsigned_field(
         &mut self,
         name: &str,
@@ -185,7 +186,7 @@ impl SchemaBuilder {
     ///         field. Fast fields are designed for random access. Access time
     ///         are similar to a random lookup in an array. If more than one
     ///         value is associated to a fast field, only the last one is kept.
-    ///         Can be one of 'single' or 'multi'. If this is set to 'single,
+    ///         Can be one of 'single' or 'multi'. If this is set to 'single',
     ///         the document must have exactly one value associated to the
     ///         document. If this is set to 'multi', the document can have any
     ///         number of values associated to the document. Defaults to None,
@@ -193,7 +194,7 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(stored = false, indexed = false)]
+    #[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
     fn add_date_field(
         &mut self,
         name: &str,
@@ -203,7 +204,29 @@ impl SchemaBuilder {
     ) -> PyResult<Self> {
         let builder = &mut self.builder;

-        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
+        let mut opts = DateOptions::default();
+        if stored {
+            opts = opts.set_stored();
+        }
+        if indexed {
+            opts = opts.set_indexed();
+        }
+        let fast = match fast {
+            Some(f) => {
+                let f = f.to_lowercase();
+                match f.as_ref() {
+                    "single" => Some(schema::Cardinality::SingleValue),
+                    "multi" => Some(schema::Cardinality::MultiValues),
+                    _ => return Err(exceptions::PyValueError::new_err(
+                        "Invalid index option, valid choices are: 'multi' and 'single'"
+                    )),
+                }
+            }
+            None => None,
+        };
+        if let Some(f) = fast {
+            opts = opts.set_fast(f);
+        }

         if let Some(builder) = builder.write().unwrap().as_mut() {
             builder.add_date_field(name, opts);
@@ -234,11 +257,12 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(
+    #[pyo3(signature = (
+        name,
         stored = false,
-        tokenizer_name = "TOKENIZER",
-        index_option = "RECORD"
-    )]
+        tokenizer_name = TOKENIZER,
+        index_option = RECORD
+    ))]
     fn add_json_field(
         &mut self,
         name: &str,
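
add_date_field now builds a DateOptions directly and validates the fast argument itself. A sketch of the Python-visible behaviour (field names are illustrative; the error text is taken from the diff above):

    import tantivy

    builder = tantivy.SchemaBuilder()
    builder.add_date_field("published", stored=True, indexed=True, fast="single")

    # Anything other than "single" or "multi" is rejected before the field is added.
    try:
        builder.add_date_field("updated", fast="bogus")
    except ValueError as err:
        print(err)  # Invalid index option, valid choices are: 'multi' and 'single'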

src/searcher.rs

@@ -10,7 +10,7 @@ use tantivy::collector::{Count, MultiCollector, TopDocs};
 /// A Searcher is used to search the index given a prepared Query.
 #[pyclass]
 pub(crate) struct Searcher {
-    pub(crate) inner: tv::LeasedItem<tv::Searcher>,
+    pub(crate) inner: tv::Searcher,
 }

 #[derive(Clone)]
@@ -22,8 +22,8 @@ enum Fruit {
 impl std::fmt::Debug for Fruit {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            Fruit::Score(s) => f.write_str(&format!("{}", s)),
-            Fruit::Order(o) => f.write_str(&format!("{}", o)),
+            Fruit::Score(s) => f.write_str(&format!("{s}")),
+            Fruit::Order(o) => f.write_str(&format!("{o}")),
         }
     }
 }
@@ -93,7 +93,7 @@ impl Searcher {
     /// Returns `SearchResult` object.
     ///
     /// Raises a ValueError if there was an error with the search.
-    #[args(limit = 10, offset = 0, count = true)]
+    #[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0))]
     fn search(
         &self,
         _py: Python,
@@ -154,10 +154,7 @@ impl Searcher {
             }
         };

-        let count = match count_handle {
-            Some(h) => Some(h.extract(&mut multifruit)),
-            None => None,
-        };
+        let count = count_handle.map(|h| h.extract(&mut multifruit));

         Ok(SearchResult { hits, count })
     }
@@ -230,11 +227,11 @@ impl From<&tv::DocAddress> for DocAddress {
     }
 }

-impl Into<tv::DocAddress> for &DocAddress {
-    fn into(self) -> tv::DocAddress {
+impl From<&DocAddress> for tv::DocAddress {
+    fn from(val: &DocAddress) -> Self {
         tv::DocAddress {
-            segment_ord: self.segment_ord(),
-            doc_id: self.doc(),
+            segment_ord: val.segment_ord(),
+            doc_id: val.doc(),
         }
     }
 }
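
The reworked search signature exposes limit, count, order_by_field, and offset as keyword arguments. A minimal end-to-end sketch (assuming SearchResult exposes the count it stores; field names and text are illustrative):

    import tantivy

    schema = tantivy.SchemaBuilder().add_text_field("body", stored=True).build()
    index = tantivy.Index(schema)
    writer = index.writer()
    doc = tantivy.Document()
    doc.add_text("body", "an old man fishing alone in the sea")
    writer.add_document(doc)
    writer.commit()
    index.reload()

    query = index.parse_query("sea", ["body"])
    searcher = index.searcher()

    # Keyword order follows #[pyo3(signature = (query, limit = 10, count = true,
    # order_by_field = None, offset = 0))].
    result = searcher.search(query, limit=5, count=True, offset=0)
    print(result.count)  # total match count when count=True (assumed accessor)
    for score, address in result.hits:
        print(score, searcher.doc(address)["body"])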

tantivy/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
+from .tantivy import *
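
With the package __init__.py re-exporting the compiled extension, a plain import of the package picks up the classes directly after a maturin build. A tiny sketch (nothing beyond the star re-export above is assumed):

    import tantivy

    schema = tantivy.SchemaBuilder().add_text_field("title", stored=True).build()
    index = tantivy.Index(schema)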

tests/tantivy_test.py

@@ -67,6 +67,56 @@ def create_index(dir=None):
     return index


+def spanish_schema():
+    return (
+        SchemaBuilder()
+        .add_text_field("title", stored=True, tokenizer_name='es_stem')
+        .add_text_field("body", tokenizer_name='es_stem')
+        .build()
+    )
+
+
+def create_spanish_index():
+    # assume all tests will use the same documents for now
+    # other methods may set up function-local indexes
+    index = Index(spanish_schema(), None)
+    writer = index.writer()
+
+    # 2 ways of adding documents
+    # 1
+    doc = Document()
+    # create a document instance
+    # add field-value pairs
+    doc.add_text("title", "El viejo y el mar")
+    doc.add_text(
+        "body",
+        (
+            "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
+        ),
+    )
+    writer.add_document(doc)
+    # 2 use the built-in json support
+    # keys need to coincide with field names
+    doc = Document.from_dict(
+        {
+            "title": "De ratones y hombres",
+            "body": (
+                "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en las junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas que se arquean sobre el estanque"
+            ),
+        }
+    )
+    writer.add_document(doc)
+    writer.add_json(
+        """{
+            "title": ["Frankenstein", "El moderno Prometeo"],
+            "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
+        }"""
+    )
+    writer.commit()
+    index.reload()
+    return index
+
+
 @pytest.fixture()
 def dir_index(tmpdir):
     return (tmpdir, create_index(str(tmpdir)))
@@ -77,6 +127,11 @@ def ram_index():
     return create_index()


+@pytest.fixture(scope="class")
+def spanish_index():
+    return create_spanish_index()
+
+
 class TestClass(object):
     def test_simple_search_in_dir(self, dir_index):
         _, index = dir_index
@@ -103,6 +158,16 @@ class TestClass(object):
         searched_doc = index.searcher().doc(doc_address)
         assert searched_doc["title"] == ["The Old Man and the Sea"]

+    def test_simple_search_in_spanish(self, spanish_index):
+        index = spanish_index
+        query = index.parse_query("vieja", ["title", "body"])
+        result = index.searcher().search(query, 10)
+        assert len(result.hits) == 1
+        _, doc_address = result.hits[0]
+        search_doc = index.searcher().doc(doc_address)
+        assert search_doc["title"] == ["El viejo y el mar"]
+
     def test_and_query(self, ram_index):
         index = ram_index
         query = index.parse_query(