Tantivy 0.19.2 (#67)

* Adding __init__.py file to the tantivy folder to make maturin happy

Add Cargo.lock to the repo

Set the git-fetch-with-cli cargo flag so that we can override fetch settings

Renaming .cargo/config to .cargo/config.toml

Adding github-quiq-sh cargo registry

Point dependencies at our github-quiq-sh registry

Trying to resolve this build issue, pointing pyo3-build-config at our github-quiq-sh registry

SER-21487: Enable support for all standard Tantivy languages plus Chinese + Japanese in tantivy-py

SER-21487: Use uname rather than UNAME in the Makefile

SER-21487: Fix document date handling

SER-23013: Upgrade Tantivy and other dependencies

* Upgrade to Tantivy 0.19.1

* Apply rustfmt and fix bug when fast option = None

* Upgrade to tantivy-0.19.2

* Standardize around using 'cargo fmt' rather than 'rustfmt'

* Reverting to old style dependencies

* Linting with clippy

* Switching out hashmap for defining tokenizers for an array, and adding test for Spanish indexing

* Use cargo fmt instead of rustfmt on the Lint ci step
master
Phill Mell-Davies 2023-02-14 08:20:59 -05:00 committed by GitHub
parent b2043793ee
commit 164adc87e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 1820 additions and 106 deletions

View File

@ -2,4 +2,4 @@
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]
]

View File

@ -18,7 +18,7 @@ jobs:
toolchain: stable
components: rustfmt
- name: Check Formatting
run: rustfmt --check src/*rs
run: cargo fmt --check
Test:
strategy:

2
.gitignore vendored
View File

@ -3,9 +3,9 @@
build
/target
**/*.rs.bk
Cargo.lock
dist/
__pycache__/
tantivy.so
tantivy.dylib
tantivy/tantivy.cpython*.so
tantivy.egg-info/

1573
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.17.0"
version = "0.19.2"
readme = "README.md"
authors = ["Damir Jelić <poljar@termina.org.uk>"]
edition = "2018"
@ -11,15 +11,15 @@ name = "tantivy"
crate-type = ["cdylib"]
[build-dependencies]
pyo3-build-config = "0.16.3"
pyo3-build-config = "0.18.0"
[dependencies]
chrono = "0.4.19"
tantivy = "0.17"
itertools = "0.10.3"
futures = "0.3.21"
serde_json = "1.0.64"
chrono = "0.4.23"
tantivy = "0.19.2"
itertools = "0.10.5"
futures = "0.3.26"
serde_json = "1.0.91"
[dependencies.pyo3]
version = "0.16.3"
features = ["extension-module"]
version = "0.18.0"
features = ["extension-module"]

View File

@ -1,4 +1,4 @@
ifeq ($(shell UNAME),Darwin)
ifeq ($(shell uname),Darwin)
EXT := dylib
else
EXT := so
@ -6,15 +6,21 @@ endif
source_files := $(wildcard src/*.rs)
all: tantivy/tantivy.$(EXT)
all: format lint build test
PHONY: test format
lint:
cargo clippy
test: tantivy/tantivy.$(EXT)
python3 -m pytest
format:
rustfmt src/*.rs
cargo fmt
build:
maturin build --interpreter python3.7 python3.8 python3.9 python3.10 python3.11
tantivy/tantivy.$(EXT): target/debug/libtantivy.$(EXT)
cp target/debug/libtantivy.$(EXT) tantivy/tantivy.so

View File

@ -1,5 +1,5 @@
[build-system]
requires = ["maturin"]
requires = ["maturin>=0.13,<0.14"]
build-backend = "maturin"
[project]

View File

@ -1 +1 @@
max_width = 80
max_width = 80

View File

@ -9,7 +9,7 @@ use pyo3::{
},
};
use chrono::{offset::TimeZone, Datelike, Timelike, Utc};
use chrono::{offset::TimeZone, Utc};
use tantivy as tv;
@ -56,37 +56,42 @@ fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
// TODO implement me
unimplemented!();
}
Value::Date(d) => PyDateTime::new(
py,
d.year(),
d.month() as u8,
d.day() as u8,
d.hour() as u8,
d.minute() as u8,
d.second() as u8,
d.timestamp_subsec_micros(),
None,
)?
.into_py(py),
Value::Date(d) => {
let utc = d.into_utc();
PyDateTime::new(
py,
utc.year(),
utc.month() as u8,
utc.day(),
utc.hour(),
utc.minute(),
utc.second(),
utc.microsecond(),
None,
)?
.into_py(py)
}
Value::Facet(f) => Facet { inner: f.clone() }.into_py(py),
Value::JsonObject(json_object) => {
let inner: HashMap<_, _> = json_object
.iter()
.map(|(k, v)| (k, value_to_object(&v, py)))
.map(|(k, v)| (k, value_to_object(v, py)))
.collect();
inner.to_object(py)
}
Value::Bool(b) => b.into_py(py),
Value::IpAddr(i) => (*i).to_string().into_py(py),
})
}
fn value_to_string(value: &Value) -> String {
match value {
Value::Str(text) => text.clone(),
Value::U64(num) => format!("{}", num),
Value::I64(num) => format!("{}", num),
Value::F64(num) => format!("{}", num),
Value::Bytes(bytes) => format!("{:?}", bytes),
Value::Date(d) => format!("{:?}", d),
Value::U64(num) => format!("{num}"),
Value::I64(num) => format!("{num}"),
Value::F64(num) => format!("{num}"),
Value::Bytes(bytes) => format!("{bytes:?}"),
Value::Date(d) => format!("{d:?}"),
Value::Facet(facet) => facet.to_string(),
Value::PreTokStr(_pretok) => {
// TODO implement me
@ -95,6 +100,8 @@ fn value_to_string(value: &Value) -> String {
Value::JsonObject(json_object) => {
serde_json::to_string(&json_object).unwrap()
}
Value::Bool(b) => format!("{b}"),
Value::IpAddr(i) => format!("{}", *i),
}
}
@ -141,10 +148,10 @@ impl fmt::Debug for Document {
.chars()
.take(10)
.collect();
format!("{}=[{}]", field_name, values_str)
format!("{field_name}=[{values_str}]")
})
.join(",");
write!(f, "Document({})", doc_str)
write!(f, "Document({doc_str})")
}
}
@ -170,23 +177,24 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
}
if let Ok(py_datetime) = any.downcast::<PyDateTime>() {
let datetime = Utc
.ymd(
.with_ymd_and_hms(
py_datetime.get_year(),
py_datetime.get_month().into(),
py_datetime.get_day().into(),
)
.and_hms_micro(
py_datetime.get_hour().into(),
py_datetime.get_minute().into(),
py_datetime.get_second().into(),
py_datetime.get_microsecond(),
);
return Ok(Value::Date(datetime));
)
.single()
.unwrap();
return Ok(Value::Date(tv::DateTime::from_timestamp_secs(
datetime.timestamp(),
)));
}
if let Ok(facet) = any.extract::<Facet>() {
return Ok(Value::Facet(facet.inner.clone()));
return Ok(Value::Facet(facet.inner));
}
Err(to_pyerr(format!("Value unsupported {:?}", any)))
Err(to_pyerr(format!("Value unsupported {any:?}")))
}
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
@ -200,7 +208,7 @@ fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
#[pymethods]
impl Document {
#[new]
#[args(kwargs = "**")]
#[pyo3(signature = (**kwargs))]
fn new(kwargs: Option<&PyDict>) -> PyResult<Self> {
let mut document = Document::default();
if let Some(field_dict) = kwargs {
@ -299,18 +307,21 @@ impl Document {
/// value (datetime): The date that will be added to the document.
fn add_date(&mut self, field_name: String, value: &PyDateTime) {
let datetime = Utc
.ymd(
.with_ymd_and_hms(
value.get_year(),
value.get_month().into(),
value.get_day().into(),
)
.and_hms_micro(
value.get_hour().into(),
value.get_minute().into(),
value.get_second().into(),
value.get_microsecond(),
);
add_value(self, field_name, datetime);
)
.single()
.unwrap();
add_value(
self,
field_name,
tv::DateTime::from_timestamp_secs(datetime.timestamp()),
);
}
/// Add a facet value to the document.
@ -387,13 +398,13 @@ impl Document {
}
fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
let gil = Python::acquire_gil();
let py = gil.python();
self.get_all(py, field_name)
Python::with_gil(|py| -> PyResult<Vec<PyObject>> {
self.get_all(py, field_name)
})
}
fn __repr__(&self) -> PyResult<String> {
Ok(format!("{:?}", self))
Ok(format!("{self:?}"))
}
}

View File

@ -14,6 +14,10 @@ use tantivy as tv;
use tantivy::{
directory::MmapDirectory,
schema::{NamedFieldDocument, Term, Value},
tokenizer::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
TextAnalyzer,
},
};
const RELOAD_POLICY: &str = "commit";
@ -120,26 +124,25 @@ impl IndexWriter {
Value::U64(num) => Term::from_field_u64(field, num),
Value::I64(num) => Term::from_field_i64(field, num),
Value::F64(num) => Term::from_field_f64(field, num),
Value::Date(d) => Term::from_field_date(field, &d),
Value::Date(d) => Term::from_field_date(field, d),
Value::Facet(facet) => Term::from_facet(field, &facet),
Value::Bytes(_) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is bytes type not deletable.",
field_name
"Field `{field_name}` is bytes type not deletable."
)))
}
Value::PreTokStr(_pretok) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is pretokenized. This is not authorized for delete.",
field_name
"Field `{field_name}` is pretokenized. This is not authorized for delete."
)))
}
Value::JsonObject(_) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is json object type not deletable.",
field_name
"Field `{field_name}` is json object type not deletable."
)))
}
},
Value::Bool(b) => Term::from_field_bool(field, b),
Value::IpAddr(i) => Term::from_field_ip_addr(field, i)
};
Ok(self.inner_index_writer.delete_term(term))
}
@ -167,12 +170,15 @@ impl Index {
#[staticmethod]
fn open(path: &str) -> PyResult<Index> {
let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
Index::register_custom_text_analyzers(&index);
let reader = index.reader().map_err(to_pyerr)?;
Ok(Index { index, reader })
}
#[new]
#[args(reuse = true)]
#[pyo3(signature = (schema, path = None, reuse = true))]
fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Self> {
let index = match path {
Some(p) => {
@ -191,6 +197,8 @@ impl Index {
None => tv::Index::create_in_ram(schema.inner.clone()),
};
Index::register_custom_text_analyzers(&index);
let reader = index.reader().map_err(to_pyerr)?;
Ok(Index { index, reader })
}
@ -208,7 +216,7 @@ impl Index {
/// automatically the number of threads.
///
/// Raises ValueError if there was an error while creating the writer.
#[args(heap_size = 3000000, num_threads = 0)]
#[pyo3(signature = (heap_size = 3000000, num_threads = 0))]
fn writer(
&self,
heap_size: usize,
@ -231,13 +239,13 @@ impl Index {
/// Args:
/// reload_policy (str, optional): The reload policy that the
/// IndexReader should use. Can be `Manual` or `OnCommit`.
/// num_searchers (int, optional): The number of searchers that the
/// num_warmers (int, optional): The number of warming threads that the
/// reader should create.
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
#[pyo3(signature = (reload_policy = RELOAD_POLICY, num_warmers = 0))]
fn config_reader(
&mut self,
reload_policy: &str,
num_searchers: usize,
num_warmers: usize,
) -> Result<(), PyErr> {
let reload_policy = reload_policy.to_lowercase();
let reload_policy = match reload_policy.as_ref() {
@ -251,8 +259,8 @@ impl Index {
};
let builder = self.index.reader_builder();
let builder = builder.reload_policy(reload_policy);
let builder = if num_searchers > 0 {
builder.num_searchers(num_searchers)
let builder = if num_warmers > 0 {
builder.num_warming_threads(num_warmers)
} else {
builder
};
@ -313,7 +321,7 @@ impl Index {
/// default_field_names (List[Field]): A list of fields used to search if no
/// field is specified in the query.
///
#[args(reload_policy = "RELOAD_POLICY")]
#[pyo3(signature = (query, default_field_names = None))]
pub fn parse_query(
&self,
query: &str,
@ -328,16 +336,14 @@ impl Index {
if !field_entry.is_indexed() {
return Err(exceptions::PyValueError::new_err(
format!(
"Field `{}` is not set as indexed in the schema.",
default_field_name
"Field `{default_field_name}` is not set as indexed in the schema."
),
));
}
default_fields.push(field);
} else {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is not defined in the schema.",
default_field_name
"Field `{default_field_name}` is not defined in the schema."
)));
}
}
@ -355,3 +361,35 @@ impl Index {
Ok(Query { inner: query })
}
}
impl Index {
fn register_custom_text_analyzers(index: &tv::Index) {
let analyzers = [
("ar_stem", Language::Arabic),
("da_stem", Language::Danish),
("nl_stem", Language::Dutch),
("fi_stem", Language::Finnish),
("fr_stem", Language::French),
("de_stem", Language::German),
("el_stem", Language::Greek),
("hu_stem", Language::Hungarian),
("it_stem", Language::Italian),
("no_stem", Language::Norwegian),
("pt_stem", Language::Portuguese),
("ro_stem", Language::Romanian),
("ru_stem", Language::Russian),
("es_stem", Language::Spanish),
("sv_stem", Language::Swedish),
("ta_stem", Language::Tamil),
("tr_stem", Language::Turkish),
];
for (name, lang) in &analyzers {
let an = TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(*lang));
index.tokenizers().register(name, an);
}
}
}

View File

@ -1,5 +1,5 @@
use ::tantivy as tv;
use pyo3::{exceptions, prelude::*};
use tantivy as tv;
mod document;
mod facet;
@ -88,8 +88,7 @@ pub(crate) fn get_field(
) -> PyResult<tv::schema::Field> {
let field = schema.get_field(field_name).ok_or_else(|| {
exceptions::PyValueError::new_err(format!(
"Field `{}` is not defined in the schema.",
field_name
"Field `{field_name}` is not defined in the schema."
))
})?;

View File

@ -6,7 +6,7 @@ use tantivy::schema;
use crate::schema::Schema;
use std::sync::{Arc, RwLock};
use tantivy::schema::INDEXED;
use tantivy::schema::{DateOptions, INDEXED};
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
@ -60,11 +60,12 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(
#[pyo3(signature = (
name,
stored = false,
tokenizer_name = "TOKENIZER",
index_option = "RECORD"
)]
tokenizer_name = TOKENIZER,
index_option = RECORD
))]
fn add_text_field(
&mut self,
name: &str,
@ -109,7 +110,7 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
fn add_integer_field(
&mut self,
name: &str,
@ -151,7 +152,7 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
fn add_unsigned_field(
&mut self,
name: &str,
@ -185,7 +186,7 @@ impl SchemaBuilder {
/// field. Fast fields are designed for random access. Access time
/// are similar to a random lookup in an array. If more than one
/// value is associated to a fast field, only the last one is kept.
/// Can be one of 'single' or 'multi'. If this is set to 'single,
/// Can be one of 'single' or 'multi'. If this is set to 'single',
/// the document must have exactly one value associated to the
/// document. If this is set to 'multi', the document can have any
/// number of values associated to the document. Defaults to None,
@ -193,7 +194,7 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
fn add_date_field(
&mut self,
name: &str,
@ -203,7 +204,29 @@ impl SchemaBuilder {
) -> PyResult<Self> {
let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
let mut opts = DateOptions::default();
if stored {
opts = opts.set_stored();
}
if indexed {
opts = opts.set_indexed();
}
let fast = match fast {
Some(f) => {
let f = f.to_lowercase();
match f.as_ref() {
"single" => Some(schema::Cardinality::SingleValue),
"multi" => Some(schema::Cardinality::MultiValues),
_ => return Err(exceptions::PyValueError::new_err(
"Invalid index option, valid choices are: 'multi' and 'single'"
)),
}
}
None => None,
};
if let Some(f) = fast {
opts = opts.set_fast(f);
}
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_date_field(name, opts);
@ -234,11 +257,12 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(
#[pyo3(signature = (
name,
stored = false,
tokenizer_name = "TOKENIZER",
index_option = "RECORD"
)]
tokenizer_name = TOKENIZER,
index_option = RECORD
))]
fn add_json_field(
&mut self,
name: &str,

View File

@ -10,7 +10,7 @@ use tantivy::collector::{Count, MultiCollector, TopDocs};
/// A Searcher is used to search the index given a prepared Query.
#[pyclass]
pub(crate) struct Searcher {
pub(crate) inner: tv::LeasedItem<tv::Searcher>,
pub(crate) inner: tv::Searcher,
}
#[derive(Clone)]
@ -22,8 +22,8 @@ enum Fruit {
impl std::fmt::Debug for Fruit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Fruit::Score(s) => f.write_str(&format!("{}", s)),
Fruit::Order(o) => f.write_str(&format!("{}", o)),
Fruit::Score(s) => f.write_str(&format!("{s}")),
Fruit::Order(o) => f.write_str(&format!("{o}")),
}
}
}
@ -93,7 +93,7 @@ impl Searcher {
/// Returns `SearchResult` object.
///
/// Raises a ValueError if there was an error with the search.
#[args(limit = 10, offset = 0, count = true)]
#[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0))]
fn search(
&self,
_py: Python,
@ -154,10 +154,7 @@ impl Searcher {
}
};
let count = match count_handle {
Some(h) => Some(h.extract(&mut multifruit)),
None => None,
};
let count = count_handle.map(|h| h.extract(&mut multifruit));
Ok(SearchResult { hits, count })
}
@ -230,11 +227,11 @@ impl From<&tv::DocAddress> for DocAddress {
}
}
impl Into<tv::DocAddress> for &DocAddress {
fn into(self) -> tv::DocAddress {
impl From<&DocAddress> for tv::DocAddress {
fn from(val: &DocAddress) -> Self {
tv::DocAddress {
segment_ord: self.segment_ord(),
doc_id: self.doc(),
segment_ord: val.segment_ord(),
doc_id: val.doc(),
}
}
}

1
tantivy/__init__.py Normal file
View File

@ -0,0 +1 @@
from .tantivy import *

View File

@ -67,6 +67,56 @@ def create_index(dir=None):
return index
def spanish_schema():
return (
SchemaBuilder()
.add_text_field("title", stored=True, tokenizer_name='es_stem')
.add_text_field("body", tokenizer_name='es_stem')
.build()
)
def create_spanish_index():
# assume all tests will use the same documents for now
# other methods may set up function-local indexes
index = Index(spanish_schema(), None)
writer = index.writer()
# 2 ways of adding documents
# 1
doc = Document()
# create a document instance
# add field-value pairs
doc.add_text("title", "El viejo y el mar")
doc.add_text(
"body",
(
"Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
),
)
writer.add_document(doc)
# 2 use the built-in json support
# keys need to coincide with field names
doc = Document.from_dict(
{
"title": "De ratones y hombres",
"body": (
"Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en las junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas que se arquean sobre el estanque"
),
}
)
writer.add_document(doc)
writer.add_json(
"""{
"title": ["Frankenstein", "El moderno Prometeo"],
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
}"""
)
writer.commit()
index.reload()
return index
@pytest.fixture()
def dir_index(tmpdir):
return (tmpdir, create_index(str(tmpdir)))
@ -77,6 +127,11 @@ def ram_index():
return create_index()
@pytest.fixture(scope="class")
def spanish_index():
return create_spanish_index()
class TestClass(object):
def test_simple_search_in_dir(self, dir_index):
_, index = dir_index
@ -103,6 +158,16 @@ class TestClass(object):
searched_doc = index.searcher().doc(doc_address)
assert searched_doc["title"] == ["The Old Man and the Sea"]
def test_simple_search_in_spanish(self, spanish_index):
index = spanish_index
query = index.parse_query("vieja", ["title", "body"])
result = index.searcher().search(query, 10)
assert len(result.hits) == 1
_, doc_address = result.hits[0]
search_doc = index.searcher().doc(doc_address)
assert search_doc["title"] == ["El viejo y el mar"]
def test_and_query(self, ram_index):
index = ram_index
query = index.parse_query(