Tantivy 0.19.2 (#67)
* Adding __init__.py file to the tantivy folder to make maturin happy Add Cargo.lock to the repo Set the git-fetch-with-cli cargo flag so that we can override fetch settings Renaming .cargo/config to .cargo/config.toml Adding github-quiq-sh cargo registry Point dependencies at our github-quiq-sh registry Trying to resolve this build issue, pointing pyo3-build-config at our github-quiq-sh registry SER-21487: Enable support for all standard Tantivy languages plus Chinese + Japanese in tantivy-py SER-21487: Use uname rather than UNAME in the Makefile SER-21487: Fix document date handling SER-23013: Upgrade Tantivy and other dependencies
* Upgrade to Tantivy 0.19.1
* Apply rustfmt and fix bug when fast option = None
* Upgrade to tantivy-0.19.2
* Standardize around using 'cargo fmt' rather than 'rustfmt'
* Reverting to old style dependencies
* Linting with clippy
* Switching out hashmap for defining tokenizers for an array, and adding test for Spanish indexing
* Use cargo fmt instead of rustfmt on the Lint ci step
master
parent
b2043793ee
commit
164adc87e1
|
@ -18,7 +18,7 @@ jobs:
|
||||||
toolchain: stable
|
toolchain: stable
|
||||||
components: rustfmt
|
components: rustfmt
|
||||||
- name: Check Formatting
|
- name: Check Formatting
|
||||||
run: rustfmt --check src/*rs
|
run: cargo fmt --check
|
||||||
|
|
||||||
Test:
|
Test:
|
||||||
strategy:
|
strategy:
|
||||||
|
|
|
@ -3,9 +3,9 @@
|
||||||
build
|
build
|
||||||
/target
|
/target
|
||||||
**/*.rs.bk
|
**/*.rs.bk
|
||||||
Cargo.lock
|
|
||||||
dist/
|
dist/
|
||||||
__pycache__/
|
__pycache__/
|
||||||
tantivy.so
|
tantivy.so
|
||||||
|
tantivy.dylib
|
||||||
tantivy/tantivy.cpython*.so
|
tantivy/tantivy.cpython*.so
|
||||||
tantivy.egg-info/
|
tantivy.egg-info/
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
16
Cargo.toml
16
Cargo.toml
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "tantivy"
|
name = "tantivy"
|
||||||
version = "0.17.0"
|
version = "0.19.2"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
authors = ["Damir Jelić <poljar@termina.org.uk>"]
|
authors = ["Damir Jelić <poljar@termina.org.uk>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
@ -11,15 +11,15 @@ name = "tantivy"
|
||||||
crate-type = ["cdylib"]
|
crate-type = ["cdylib"]
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
pyo3-build-config = "0.16.3"
|
pyo3-build-config = "0.18.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
chrono = "0.4.19"
|
chrono = "0.4.23"
|
||||||
tantivy = "0.17"
|
tantivy = "0.19.2"
|
||||||
itertools = "0.10.3"
|
itertools = "0.10.5"
|
||||||
futures = "0.3.21"
|
futures = "0.3.26"
|
||||||
serde_json = "1.0.64"
|
serde_json = "1.0.91"
|
||||||
|
|
||||||
[dependencies.pyo3]
|
[dependencies.pyo3]
|
||||||
version = "0.16.3"
|
version = "0.18.0"
|
||||||
features = ["extension-module"]
|
features = ["extension-module"]
|
12
Makefile
12
Makefile
|
@ -1,4 +1,4 @@
|
||||||
ifeq ($(shell UNAME),Darwin)
|
ifeq ($(shell uname),Darwin)
|
||||||
EXT := dylib
|
EXT := dylib
|
||||||
else
|
else
|
||||||
EXT := so
|
EXT := so
|
||||||
|
@ -6,15 +6,21 @@ endif
|
||||||
|
|
||||||
source_files := $(wildcard src/*.rs)
|
source_files := $(wildcard src/*.rs)
|
||||||
|
|
||||||
all: tantivy/tantivy.$(EXT)
|
all: format lint build test
|
||||||
|
|
||||||
PHONY: test format
|
PHONY: test format
|
||||||
|
|
||||||
|
lint:
|
||||||
|
cargo clippy
|
||||||
|
|
||||||
test: tantivy/tantivy.$(EXT)
|
test: tantivy/tantivy.$(EXT)
|
||||||
python3 -m pytest
|
python3 -m pytest
|
||||||
|
|
||||||
format:
|
format:
|
||||||
rustfmt src/*.rs
|
cargo fmt
|
||||||
|
|
||||||
|
build:
|
||||||
|
maturin build --interpreter python3.7 python3.8 python3.9 python3.10 python3.11
|
||||||
|
|
||||||
tantivy/tantivy.$(EXT): target/debug/libtantivy.$(EXT)
|
tantivy/tantivy.$(EXT): target/debug/libtantivy.$(EXT)
|
||||||
cp target/debug/libtantivy.$(EXT) tantivy/tantivy.so
|
cp target/debug/libtantivy.$(EXT) tantivy/tantivy.so
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["maturin"]
|
requires = ["maturin>=0.13,<0.14"]
|
||||||
build-backend = "maturin"
|
build-backend = "maturin"
|
||||||
|
|
||||||
[project]
|
[project]
|
||||||
|
|
|
@ -9,7 +9,7 @@ use pyo3::{
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
use chrono::{offset::TimeZone, Datelike, Timelike, Utc};
|
use chrono::{offset::TimeZone, Utc};
|
||||||
|
|
||||||
use tantivy as tv;
|
use tantivy as tv;
|
||||||
|
|
||||||
|
@ -56,37 +56,42 @@ fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
|
||||||
// TODO implement me
|
// TODO implement me
|
||||||
unimplemented!();
|
unimplemented!();
|
||||||
}
|
}
|
||||||
Value::Date(d) => PyDateTime::new(
|
Value::Date(d) => {
|
||||||
|
let utc = d.into_utc();
|
||||||
|
PyDateTime::new(
|
||||||
py,
|
py,
|
||||||
d.year(),
|
utc.year(),
|
||||||
d.month() as u8,
|
utc.month() as u8,
|
||||||
d.day() as u8,
|
utc.day(),
|
||||||
d.hour() as u8,
|
utc.hour(),
|
||||||
d.minute() as u8,
|
utc.minute(),
|
||||||
d.second() as u8,
|
utc.second(),
|
||||||
d.timestamp_subsec_micros(),
|
utc.microsecond(),
|
||||||
None,
|
None,
|
||||||
)?
|
)?
|
||||||
.into_py(py),
|
.into_py(py)
|
||||||
|
}
|
||||||
Value::Facet(f) => Facet { inner: f.clone() }.into_py(py),
|
Value::Facet(f) => Facet { inner: f.clone() }.into_py(py),
|
||||||
Value::JsonObject(json_object) => {
|
Value::JsonObject(json_object) => {
|
||||||
let inner: HashMap<_, _> = json_object
|
let inner: HashMap<_, _> = json_object
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(k, v)| (k, value_to_object(&v, py)))
|
.map(|(k, v)| (k, value_to_object(v, py)))
|
||||||
.collect();
|
.collect();
|
||||||
inner.to_object(py)
|
inner.to_object(py)
|
||||||
}
|
}
|
||||||
|
Value::Bool(b) => b.into_py(py),
|
||||||
|
Value::IpAddr(i) => (*i).to_string().into_py(py),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn value_to_string(value: &Value) -> String {
|
fn value_to_string(value: &Value) -> String {
|
||||||
match value {
|
match value {
|
||||||
Value::Str(text) => text.clone(),
|
Value::Str(text) => text.clone(),
|
||||||
Value::U64(num) => format!("{}", num),
|
Value::U64(num) => format!("{num}"),
|
||||||
Value::I64(num) => format!("{}", num),
|
Value::I64(num) => format!("{num}"),
|
||||||
Value::F64(num) => format!("{}", num),
|
Value::F64(num) => format!("{num}"),
|
||||||
Value::Bytes(bytes) => format!("{:?}", bytes),
|
Value::Bytes(bytes) => format!("{bytes:?}"),
|
||||||
Value::Date(d) => format!("{:?}", d),
|
Value::Date(d) => format!("{d:?}"),
|
||||||
Value::Facet(facet) => facet.to_string(),
|
Value::Facet(facet) => facet.to_string(),
|
||||||
Value::PreTokStr(_pretok) => {
|
Value::PreTokStr(_pretok) => {
|
||||||
// TODO implement me
|
// TODO implement me
|
||||||
|
@ -95,6 +100,8 @@ fn value_to_string(value: &Value) -> String {
|
||||||
Value::JsonObject(json_object) => {
|
Value::JsonObject(json_object) => {
|
||||||
serde_json::to_string(&json_object).unwrap()
|
serde_json::to_string(&json_object).unwrap()
|
||||||
}
|
}
|
||||||
|
Value::Bool(b) => format!("{b}"),
|
||||||
|
Value::IpAddr(i) => format!("{}", *i),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -141,10 +148,10 @@ impl fmt::Debug for Document {
|
||||||
.chars()
|
.chars()
|
||||||
.take(10)
|
.take(10)
|
||||||
.collect();
|
.collect();
|
||||||
format!("{}=[{}]", field_name, values_str)
|
format!("{field_name}=[{values_str}]")
|
||||||
})
|
})
|
||||||
.join(",");
|
.join(",");
|
||||||
write!(f, "Document({})", doc_str)
|
write!(f, "Document({doc_str})")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -170,23 +177,24 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
|
||||||
}
|
}
|
||||||
if let Ok(py_datetime) = any.downcast::<PyDateTime>() {
|
if let Ok(py_datetime) = any.downcast::<PyDateTime>() {
|
||||||
let datetime = Utc
|
let datetime = Utc
|
||||||
.ymd(
|
.with_ymd_and_hms(
|
||||||
py_datetime.get_year(),
|
py_datetime.get_year(),
|
||||||
py_datetime.get_month().into(),
|
py_datetime.get_month().into(),
|
||||||
py_datetime.get_day().into(),
|
py_datetime.get_day().into(),
|
||||||
)
|
|
||||||
.and_hms_micro(
|
|
||||||
py_datetime.get_hour().into(),
|
py_datetime.get_hour().into(),
|
||||||
py_datetime.get_minute().into(),
|
py_datetime.get_minute().into(),
|
||||||
py_datetime.get_second().into(),
|
py_datetime.get_second().into(),
|
||||||
py_datetime.get_microsecond(),
|
)
|
||||||
);
|
.single()
|
||||||
return Ok(Value::Date(datetime));
|
.unwrap();
|
||||||
|
return Ok(Value::Date(tv::DateTime::from_timestamp_secs(
|
||||||
|
datetime.timestamp(),
|
||||||
|
)));
|
||||||
}
|
}
|
||||||
if let Ok(facet) = any.extract::<Facet>() {
|
if let Ok(facet) = any.extract::<Facet>() {
|
||||||
return Ok(Value::Facet(facet.inner.clone()));
|
return Ok(Value::Facet(facet.inner));
|
||||||
}
|
}
|
||||||
Err(to_pyerr(format!("Value unsupported {:?}", any)))
|
Err(to_pyerr(format!("Value unsupported {any:?}")))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
|
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
|
||||||
|
@ -200,7 +208,7 @@ fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl Document {
|
impl Document {
|
||||||
#[new]
|
#[new]
|
||||||
#[args(kwargs = "**")]
|
#[pyo3(signature = (**kwargs))]
|
||||||
fn new(kwargs: Option<&PyDict>) -> PyResult<Self> {
|
fn new(kwargs: Option<&PyDict>) -> PyResult<Self> {
|
||||||
let mut document = Document::default();
|
let mut document = Document::default();
|
||||||
if let Some(field_dict) = kwargs {
|
if let Some(field_dict) = kwargs {
|
||||||
|
@ -299,18 +307,21 @@ impl Document {
|
||||||
/// value (datetime): The date that will be added to the document.
|
/// value (datetime): The date that will be added to the document.
|
||||||
fn add_date(&mut self, field_name: String, value: &PyDateTime) {
|
fn add_date(&mut self, field_name: String, value: &PyDateTime) {
|
||||||
let datetime = Utc
|
let datetime = Utc
|
||||||
.ymd(
|
.with_ymd_and_hms(
|
||||||
value.get_year(),
|
value.get_year(),
|
||||||
value.get_month().into(),
|
value.get_month().into(),
|
||||||
value.get_day().into(),
|
value.get_day().into(),
|
||||||
)
|
|
||||||
.and_hms_micro(
|
|
||||||
value.get_hour().into(),
|
value.get_hour().into(),
|
||||||
value.get_minute().into(),
|
value.get_minute().into(),
|
||||||
value.get_second().into(),
|
value.get_second().into(),
|
||||||
value.get_microsecond(),
|
)
|
||||||
|
.single()
|
||||||
|
.unwrap();
|
||||||
|
add_value(
|
||||||
|
self,
|
||||||
|
field_name,
|
||||||
|
tv::DateTime::from_timestamp_secs(datetime.timestamp()),
|
||||||
);
|
);
|
||||||
add_value(self, field_name, datetime);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a facet value to the document.
|
/// Add a facet value to the document.
|
||||||
|
@ -387,13 +398,13 @@ impl Document {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
|
fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
|
||||||
let gil = Python::acquire_gil();
|
Python::with_gil(|py| -> PyResult<Vec<PyObject>> {
|
||||||
let py = gil.python();
|
|
||||||
self.get_all(py, field_name)
|
self.get_all(py, field_name)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn __repr__(&self) -> PyResult<String> {
|
fn __repr__(&self) -> PyResult<String> {
|
||||||
Ok(format!("{:?}", self))
|
Ok(format!("{self:?}"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
78
src/index.rs
78
src/index.rs
|
@ -14,6 +14,10 @@ use tantivy as tv;
|
||||||
use tantivy::{
|
use tantivy::{
|
||||||
directory::MmapDirectory,
|
directory::MmapDirectory,
|
||||||
schema::{NamedFieldDocument, Term, Value},
|
schema::{NamedFieldDocument, Term, Value},
|
||||||
|
tokenizer::{
|
||||||
|
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
|
||||||
|
TextAnalyzer,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
const RELOAD_POLICY: &str = "commit";
|
const RELOAD_POLICY: &str = "commit";
|
||||||
|
@ -120,26 +124,25 @@ impl IndexWriter {
|
||||||
Value::U64(num) => Term::from_field_u64(field, num),
|
Value::U64(num) => Term::from_field_u64(field, num),
|
||||||
Value::I64(num) => Term::from_field_i64(field, num),
|
Value::I64(num) => Term::from_field_i64(field, num),
|
||||||
Value::F64(num) => Term::from_field_f64(field, num),
|
Value::F64(num) => Term::from_field_f64(field, num),
|
||||||
Value::Date(d) => Term::from_field_date(field, &d),
|
Value::Date(d) => Term::from_field_date(field, d),
|
||||||
Value::Facet(facet) => Term::from_facet(field, &facet),
|
Value::Facet(facet) => Term::from_facet(field, &facet),
|
||||||
Value::Bytes(_) => {
|
Value::Bytes(_) => {
|
||||||
return Err(exceptions::PyValueError::new_err(format!(
|
return Err(exceptions::PyValueError::new_err(format!(
|
||||||
"Field `{}` is bytes type not deletable.",
|
"Field `{field_name}` is bytes type not deletable."
|
||||||
field_name
|
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
Value::PreTokStr(_pretok) => {
|
Value::PreTokStr(_pretok) => {
|
||||||
return Err(exceptions::PyValueError::new_err(format!(
|
return Err(exceptions::PyValueError::new_err(format!(
|
||||||
"Field `{}` is pretokenized. This is not authorized for delete.",
|
"Field `{field_name}` is pretokenized. This is not authorized for delete."
|
||||||
field_name
|
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
Value::JsonObject(_) => {
|
Value::JsonObject(_) => {
|
||||||
return Err(exceptions::PyValueError::new_err(format!(
|
return Err(exceptions::PyValueError::new_err(format!(
|
||||||
"Field `{}` is json object type not deletable.",
|
"Field `{field_name}` is json object type not deletable."
|
||||||
field_name
|
|
||||||
)))
|
)))
|
||||||
}
|
},
|
||||||
|
Value::Bool(b) => Term::from_field_bool(field, b),
|
||||||
|
Value::IpAddr(i) => Term::from_field_ip_addr(field, i)
|
||||||
};
|
};
|
||||||
Ok(self.inner_index_writer.delete_term(term))
|
Ok(self.inner_index_writer.delete_term(term))
|
||||||
}
|
}
|
||||||
|
@ -167,12 +170,15 @@ impl Index {
|
||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
fn open(path: &str) -> PyResult<Index> {
|
fn open(path: &str) -> PyResult<Index> {
|
||||||
let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
|
let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
|
||||||
|
|
||||||
|
Index::register_custom_text_analyzers(&index);
|
||||||
|
|
||||||
let reader = index.reader().map_err(to_pyerr)?;
|
let reader = index.reader().map_err(to_pyerr)?;
|
||||||
Ok(Index { index, reader })
|
Ok(Index { index, reader })
|
||||||
}
|
}
|
||||||
|
|
||||||
#[new]
|
#[new]
|
||||||
#[args(reuse = true)]
|
#[pyo3(signature = (schema, path = None, reuse = true))]
|
||||||
fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Self> {
|
fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Self> {
|
||||||
let index = match path {
|
let index = match path {
|
||||||
Some(p) => {
|
Some(p) => {
|
||||||
|
@ -191,6 +197,8 @@ impl Index {
|
||||||
None => tv::Index::create_in_ram(schema.inner.clone()),
|
None => tv::Index::create_in_ram(schema.inner.clone()),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Index::register_custom_text_analyzers(&index);
|
||||||
|
|
||||||
let reader = index.reader().map_err(to_pyerr)?;
|
let reader = index.reader().map_err(to_pyerr)?;
|
||||||
Ok(Index { index, reader })
|
Ok(Index { index, reader })
|
||||||
}
|
}
|
||||||
|
@ -208,7 +216,7 @@ impl Index {
|
||||||
/// automatically the number of threads.
|
/// automatically the number of threads.
|
||||||
///
|
///
|
||||||
/// Raises ValueError if there was an error while creating the writer.
|
/// Raises ValueError if there was an error while creating the writer.
|
||||||
#[args(heap_size = 3000000, num_threads = 0)]
|
#[pyo3(signature = (heap_size = 3000000, num_threads = 0))]
|
||||||
fn writer(
|
fn writer(
|
||||||
&self,
|
&self,
|
||||||
heap_size: usize,
|
heap_size: usize,
|
||||||
|
@ -231,13 +239,13 @@ impl Index {
|
||||||
/// Args:
|
/// Args:
|
||||||
/// reload_policy (str, optional): The reload policy that the
|
/// reload_policy (str, optional): The reload policy that the
|
||||||
/// IndexReader should use. Can be `Manual` or `OnCommit`.
|
/// IndexReader should use. Can be `Manual` or `OnCommit`.
|
||||||
/// num_searchers (int, optional): The number of searchers that the
|
/// num_warmers (int, optional): The number of warming threads that the
|
||||||
/// reader should create.
|
/// reader should create.
|
||||||
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
|
#[pyo3(signature = (reload_policy = RELOAD_POLICY, num_warmers = 0))]
|
||||||
fn config_reader(
|
fn config_reader(
|
||||||
&mut self,
|
&mut self,
|
||||||
reload_policy: &str,
|
reload_policy: &str,
|
||||||
num_searchers: usize,
|
num_warmers: usize,
|
||||||
) -> Result<(), PyErr> {
|
) -> Result<(), PyErr> {
|
||||||
let reload_policy = reload_policy.to_lowercase();
|
let reload_policy = reload_policy.to_lowercase();
|
||||||
let reload_policy = match reload_policy.as_ref() {
|
let reload_policy = match reload_policy.as_ref() {
|
||||||
|
@ -251,8 +259,8 @@ impl Index {
|
||||||
};
|
};
|
||||||
let builder = self.index.reader_builder();
|
let builder = self.index.reader_builder();
|
||||||
let builder = builder.reload_policy(reload_policy);
|
let builder = builder.reload_policy(reload_policy);
|
||||||
let builder = if num_searchers > 0 {
|
let builder = if num_warmers > 0 {
|
||||||
builder.num_searchers(num_searchers)
|
builder.num_warming_threads(num_warmers)
|
||||||
} else {
|
} else {
|
||||||
builder
|
builder
|
||||||
};
|
};
|
||||||
|
@ -313,7 +321,7 @@ impl Index {
|
||||||
/// default_fields_names (List[Field]): A list of fields used to search if no
|
/// default_fields_names (List[Field]): A list of fields used to search if no
|
||||||
/// field is specified in the query.
|
/// field is specified in the query.
|
||||||
///
|
///
|
||||||
#[args(reload_policy = "RELOAD_POLICY")]
|
#[pyo3(signature = (query, default_field_names = None))]
|
||||||
pub fn parse_query(
|
pub fn parse_query(
|
||||||
&self,
|
&self,
|
||||||
query: &str,
|
query: &str,
|
||||||
|
@ -328,16 +336,14 @@ impl Index {
|
||||||
if !field_entry.is_indexed() {
|
if !field_entry.is_indexed() {
|
||||||
return Err(exceptions::PyValueError::new_err(
|
return Err(exceptions::PyValueError::new_err(
|
||||||
format!(
|
format!(
|
||||||
"Field `{}` is not set as indexed in the schema.",
|
"Field `{default_field_name}` is not set as indexed in the schema."
|
||||||
default_field_name
|
|
||||||
),
|
),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
default_fields.push(field);
|
default_fields.push(field);
|
||||||
} else {
|
} else {
|
||||||
return Err(exceptions::PyValueError::new_err(format!(
|
return Err(exceptions::PyValueError::new_err(format!(
|
||||||
"Field `{}` is not defined in the schema.",
|
"Field `{default_field_name}` is not defined in the schema."
|
||||||
default_field_name
|
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -355,3 +361,35 @@ impl Index {
|
||||||
Ok(Query { inner: query })
|
Ok(Query { inner: query })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Index {
|
||||||
|
fn register_custom_text_analyzers(index: &tv::Index) {
|
||||||
|
let analyzers = [
|
||||||
|
("ar_stem", Language::Arabic),
|
||||||
|
("da_stem", Language::Danish),
|
||||||
|
("nl_stem", Language::Dutch),
|
||||||
|
("fi_stem", Language::Finnish),
|
||||||
|
("fr_stem", Language::French),
|
||||||
|
("de_stem", Language::German),
|
||||||
|
("el_stem", Language::Greek),
|
||||||
|
("hu_stem", Language::Hungarian),
|
||||||
|
("it_stem", Language::Italian),
|
||||||
|
("no_stem", Language::Norwegian),
|
||||||
|
("pt_stem", Language::Portuguese),
|
||||||
|
("ro_stem", Language::Romanian),
|
||||||
|
("ru_stem", Language::Russian),
|
||||||
|
("es_stem", Language::Spanish),
|
||||||
|
("sv_stem", Language::Swedish),
|
||||||
|
("ta_stem", Language::Tamil),
|
||||||
|
("tr_stem", Language::Turkish),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (name, lang) in &analyzers {
|
||||||
|
let an = TextAnalyzer::from(SimpleTokenizer)
|
||||||
|
.filter(RemoveLongFilter::limit(40))
|
||||||
|
.filter(LowerCaser)
|
||||||
|
.filter(Stemmer::new(*lang));
|
||||||
|
index.tokenizers().register(name, an);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
use ::tantivy as tv;
|
||||||
use pyo3::{exceptions, prelude::*};
|
use pyo3::{exceptions, prelude::*};
|
||||||
use tantivy as tv;
|
|
||||||
|
|
||||||
mod document;
|
mod document;
|
||||||
mod facet;
|
mod facet;
|
||||||
|
@ -88,8 +88,7 @@ pub(crate) fn get_field(
|
||||||
) -> PyResult<tv::schema::Field> {
|
) -> PyResult<tv::schema::Field> {
|
||||||
let field = schema.get_field(field_name).ok_or_else(|| {
|
let field = schema.get_field(field_name).ok_or_else(|| {
|
||||||
exceptions::PyValueError::new_err(format!(
|
exceptions::PyValueError::new_err(format!(
|
||||||
"Field `{}` is not defined in the schema.",
|
"Field `{field_name}` is not defined in the schema."
|
||||||
field_name
|
|
||||||
))
|
))
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ use tantivy::schema;
|
||||||
|
|
||||||
use crate::schema::Schema;
|
use crate::schema::Schema;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use tantivy::schema::INDEXED;
|
use tantivy::schema::{DateOptions, INDEXED};
|
||||||
|
|
||||||
/// Tantivy has a very strict schema.
|
/// Tantivy has a very strict schema.
|
||||||
/// You need to specify in advance whether a field is indexed or not,
|
/// You need to specify in advance whether a field is indexed or not,
|
||||||
|
@ -60,11 +60,12 @@ impl SchemaBuilder {
|
||||||
///
|
///
|
||||||
/// Returns the associated field handle.
|
/// Returns the associated field handle.
|
||||||
/// Raises a ValueError if there was an error with the field creation.
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
#[args(
|
#[pyo3(signature = (
|
||||||
|
name,
|
||||||
stored = false,
|
stored = false,
|
||||||
tokenizer_name = "TOKENIZER",
|
tokenizer_name = TOKENIZER,
|
||||||
index_option = "RECORD"
|
index_option = RECORD
|
||||||
)]
|
))]
|
||||||
fn add_text_field(
|
fn add_text_field(
|
||||||
&mut self,
|
&mut self,
|
||||||
name: &str,
|
name: &str,
|
||||||
|
@ -109,7 +110,7 @@ impl SchemaBuilder {
|
||||||
///
|
///
|
||||||
/// Returns the associated field handle.
|
/// Returns the associated field handle.
|
||||||
/// Raises a ValueError if there was an error with the field creation.
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
#[args(stored = false, indexed = false)]
|
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
|
||||||
fn add_integer_field(
|
fn add_integer_field(
|
||||||
&mut self,
|
&mut self,
|
||||||
name: &str,
|
name: &str,
|
||||||
|
@ -151,7 +152,7 @@ impl SchemaBuilder {
|
||||||
///
|
///
|
||||||
/// Returns the associated field handle.
|
/// Returns the associated field handle.
|
||||||
/// Raises a ValueError if there was an error with the field creation.
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
#[args(stored = false, indexed = false)]
|
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
|
||||||
fn add_unsigned_field(
|
fn add_unsigned_field(
|
||||||
&mut self,
|
&mut self,
|
||||||
name: &str,
|
name: &str,
|
||||||
|
@ -185,7 +186,7 @@ impl SchemaBuilder {
|
||||||
/// field. Fast fields are designed for random access. Access time
|
/// field. Fast fields are designed for random access. Access time
|
||||||
/// are similar to a random lookup in an array. If more than one
|
/// are similar to a random lookup in an array. If more than one
|
||||||
/// value is associated to a fast field, only the last one is kept.
|
/// value is associated to a fast field, only the last one is kept.
|
||||||
/// Can be one of 'single' or 'multi'. If this is set to 'single,
|
/// Can be one of 'single' or 'multi'. If this is set to 'single',
|
||||||
/// the document must have exactly one value associated to the
|
/// the document must have exactly one value associated to the
|
||||||
/// document. If this is set to 'multi', the document can have any
|
/// document. If this is set to 'multi', the document can have any
|
||||||
/// number of values associated to the document. Defaults to None,
|
/// number of values associated to the document. Defaults to None,
|
||||||
|
@ -193,7 +194,7 @@ impl SchemaBuilder {
|
||||||
///
|
///
|
||||||
/// Returns the associated field handle.
|
/// Returns the associated field handle.
|
||||||
/// Raises a ValueError if there was an error with the field creation.
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
#[args(stored = false, indexed = false)]
|
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
|
||||||
fn add_date_field(
|
fn add_date_field(
|
||||||
&mut self,
|
&mut self,
|
||||||
name: &str,
|
name: &str,
|
||||||
|
@ -203,7 +204,29 @@ impl SchemaBuilder {
|
||||||
) -> PyResult<Self> {
|
) -> PyResult<Self> {
|
||||||
let builder = &mut self.builder;
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
let mut opts = DateOptions::default();
|
||||||
|
if stored {
|
||||||
|
opts = opts.set_stored();
|
||||||
|
}
|
||||||
|
if indexed {
|
||||||
|
opts = opts.set_indexed();
|
||||||
|
}
|
||||||
|
let fast = match fast {
|
||||||
|
Some(f) => {
|
||||||
|
let f = f.to_lowercase();
|
||||||
|
match f.as_ref() {
|
||||||
|
"single" => Some(schema::Cardinality::SingleValue),
|
||||||
|
"multi" => Some(schema::Cardinality::MultiValues),
|
||||||
|
_ => return Err(exceptions::PyValueError::new_err(
|
||||||
|
"Invalid index option, valid choices are: 'multi' and 'single'"
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
if let Some(f) = fast {
|
||||||
|
opts = opts.set_fast(f);
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(builder) = builder.write().unwrap().as_mut() {
|
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||||
builder.add_date_field(name, opts);
|
builder.add_date_field(name, opts);
|
||||||
|
@ -234,11 +257,12 @@ impl SchemaBuilder {
|
||||||
///
|
///
|
||||||
/// Returns the associated field handle.
|
/// Returns the associated field handle.
|
||||||
/// Raises a ValueError if there was an error with the field creation.
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
#[args(
|
#[pyo3(signature = (
|
||||||
|
name,
|
||||||
stored = false,
|
stored = false,
|
||||||
tokenizer_name = "TOKENIZER",
|
tokenizer_name = TOKENIZER,
|
||||||
index_option = "RECORD"
|
index_option = RECORD
|
||||||
)]
|
))]
|
||||||
fn add_json_field(
|
fn add_json_field(
|
||||||
&mut self,
|
&mut self,
|
||||||
name: &str,
|
name: &str,
|
||||||
|
|
|
@ -10,7 +10,7 @@ use tantivy::collector::{Count, MultiCollector, TopDocs};
|
||||||
/// A Searcher is used to search the index given a prepared Query.
|
/// A Searcher is used to search the index given a prepared Query.
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub(crate) struct Searcher {
|
pub(crate) struct Searcher {
|
||||||
pub(crate) inner: tv::LeasedItem<tv::Searcher>,
|
pub(crate) inner: tv::Searcher,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
|
@ -22,8 +22,8 @@ enum Fruit {
|
||||||
impl std::fmt::Debug for Fruit {
|
impl std::fmt::Debug for Fruit {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Fruit::Score(s) => f.write_str(&format!("{}", s)),
|
Fruit::Score(s) => f.write_str(&format!("{s}")),
|
||||||
Fruit::Order(o) => f.write_str(&format!("{}", o)),
|
Fruit::Order(o) => f.write_str(&format!("{o}")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -93,7 +93,7 @@ impl Searcher {
|
||||||
/// Returns `SearchResult` object.
|
/// Returns `SearchResult` object.
|
||||||
///
|
///
|
||||||
/// Raises a ValueError if there was an error with the search.
|
/// Raises a ValueError if there was an error with the search.
|
||||||
#[args(limit = 10, offset = 0, count = true)]
|
#[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0))]
|
||||||
fn search(
|
fn search(
|
||||||
&self,
|
&self,
|
||||||
_py: Python,
|
_py: Python,
|
||||||
|
@ -154,10 +154,7 @@ impl Searcher {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let count = match count_handle {
|
let count = count_handle.map(|h| h.extract(&mut multifruit));
|
||||||
Some(h) => Some(h.extract(&mut multifruit)),
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(SearchResult { hits, count })
|
Ok(SearchResult { hits, count })
|
||||||
}
|
}
|
||||||
|
@ -230,11 +227,11 @@ impl From<&tv::DocAddress> for DocAddress {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Into<tv::DocAddress> for &DocAddress {
|
impl From<&DocAddress> for tv::DocAddress {
|
||||||
fn into(self) -> tv::DocAddress {
|
fn from(val: &DocAddress) -> Self {
|
||||||
tv::DocAddress {
|
tv::DocAddress {
|
||||||
segment_ord: self.segment_ord(),
|
segment_ord: val.segment_ord(),
|
||||||
doc_id: self.doc(),
|
doc_id: val.doc(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
from .tantivy import *
|
|
@ -67,6 +67,56 @@ def create_index(dir=None):
|
||||||
return index
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def spanish_schema():
|
||||||
|
return (
|
||||||
|
SchemaBuilder()
|
||||||
|
.add_text_field("title", stored=True, tokenizer_name='es_stem')
|
||||||
|
.add_text_field("body", tokenizer_name='es_stem')
|
||||||
|
.build()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def create_spanish_index():
|
||||||
|
# assume all tests will use the same documents for now
|
||||||
|
# other methods may set up function-local indexes
|
||||||
|
index = Index(spanish_schema(), None)
|
||||||
|
writer = index.writer()
|
||||||
|
|
||||||
|
# 2 ways of adding documents
|
||||||
|
# 1
|
||||||
|
doc = Document()
|
||||||
|
# create a document instance
|
||||||
|
# add field-value pairs
|
||||||
|
doc.add_text("title", "El viejo y el mar")
|
||||||
|
doc.add_text(
|
||||||
|
"body",
|
||||||
|
(
|
||||||
|
"Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
|
||||||
|
),
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
# 2 use the built-in json support
|
||||||
|
# keys need to coincide with field names
|
||||||
|
doc = Document.from_dict(
|
||||||
|
{
|
||||||
|
"title": "De ratones y hombres",
|
||||||
|
"body": (
|
||||||
|
"Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
writer.add_json(
|
||||||
|
"""{
|
||||||
|
"title": ["Frankenstein", "El moderno Prometeo"],
|
||||||
|
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
|
||||||
|
}"""
|
||||||
|
)
|
||||||
|
writer.commit()
|
||||||
|
index.reload()
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def dir_index(tmpdir):
|
def dir_index(tmpdir):
|
||||||
return (tmpdir, create_index(str(tmpdir)))
|
return (tmpdir, create_index(str(tmpdir)))
|
||||||
|
@ -77,6 +127,11 @@ def ram_index():
|
||||||
return create_index()
|
return create_index()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="class")
|
||||||
|
def spanish_index():
|
||||||
|
return create_spanish_index()
|
||||||
|
|
||||||
|
|
||||||
class TestClass(object):
|
class TestClass(object):
|
||||||
def test_simple_search_in_dir(self, dir_index):
|
def test_simple_search_in_dir(self, dir_index):
|
||||||
_, index = dir_index
|
_, index = dir_index
|
||||||
|
@ -103,6 +158,16 @@ class TestClass(object):
|
||||||
searched_doc = index.searcher().doc(doc_address)
|
searched_doc = index.searcher().doc(doc_address)
|
||||||
assert searched_doc["title"] == ["The Old Man and the Sea"]
|
assert searched_doc["title"] == ["The Old Man and the Sea"]
|
||||||
|
|
||||||
|
def test_simple_search_in_spanish(self, spanish_index):
|
||||||
|
index = spanish_index
|
||||||
|
query = index.parse_query("vieja", ["title", "body"])
|
||||||
|
|
||||||
|
result = index.searcher().search(query, 10)
|
||||||
|
assert len(result.hits) == 1
|
||||||
|
_, doc_address = result.hits[0]
|
||||||
|
search_doc = index.searcher().doc(doc_address)
|
||||||
|
assert search_doc["title"] == ["El viejo y el mar"]
|
||||||
|
|
||||||
def test_and_query(self, ram_index):
|
def test_and_query(self, ram_index):
|
||||||
index = ram_index
|
index = ram_index
|
||||||
query = index.parse_query(
|
query = index.parse_query(
|
||||||
|
|
Loading…
Reference in New Issue