Merge branch 'master' into add_float_support
commit 4ffdffe4d6

@@ -18,7 +18,7 @@ jobs:
         toolchain: stable
         components: rustfmt
     - name: Check Formatting
-      run: rustfmt --check src/*rs
+      run: cargo fmt --check

   Test:
     strategy:
@@ -26,6 +26,8 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: [3.9]
        include:
+         - os: ubuntu-latest
+           python-version: "3.11"
          - os: ubuntu-latest
            python-version: "3.10"
          - os: ubuntu-latest
@@ -3,9 +3,9 @@
 build
 /target
 **/*.rs.bk
 Cargo.lock
 dist/
 __pycache__/
 tantivy.so
-tantivy.dylib
+tantivy/tantivy.cpython*.so
 tantivy.egg-info/
(File diff suppressed because it is too large)
Cargo.toml (16 changes)
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.17.0"
+version = "0.19.2"
 readme = "README.md"
 authors = ["Damir Jelić <poljar@termina.org.uk>"]
 edition = "2018"
@@ -11,15 +11,15 @@ name = "tantivy"
 crate-type = ["cdylib"]

 [build-dependencies]
-pyo3-build-config = "0.16.3"
+pyo3-build-config = "0.18.0"

 [dependencies]
-chrono = "0.4.19"
-tantivy = "0.17"
-itertools = "0.10.3"
-futures = "0.3.21"
-serde_json = "1.0.64"
+chrono = "0.4.23"
+tantivy = "0.19.2"
+itertools = "0.10.5"
+futures = "0.3.26"
+serde_json = "1.0.91"

 [dependencies.pyo3]
-version = "0.16.3"
+version = "0.18.0"
 features = ["extension-module"]
Makefile (12 changes)
@@ -1,4 +1,4 @@
-ifeq ($(shell UNAME),Darwin)
+ifeq ($(shell uname),Darwin)
 	EXT := dylib
 else
 	EXT := so
@@ -6,15 +6,21 @@ endif

 source_files := $(wildcard src/*.rs)

-all: tantivy/tantivy.$(EXT)
+all: format lint build test

 PHONY: test format

+lint:
+	cargo clippy
+
 test: tantivy/tantivy.$(EXT)
 	python3 -m pytest

 format:
-	rustfmt src/*.rs
+	cargo fmt

+build:
+	maturin build --interpreter python3.7 python3.8 python3.9 python3.10 python3.11
+
 tantivy/tantivy.$(EXT): target/debug/libtantivy.$(EXT)
 	cp target/debug/libtantivy.$(EXT) tantivy/tantivy.so
README.md (93 changes)
@@ -4,7 +4,7 @@
 tantivy-py
 ==========

-Python bindings for Tantivy.
+Python bindings for [Tantivy](https://github.com/quickwit-oss/tantivy), the full-text search engine library written in Rust.


 # Installation
@@ -44,6 +44,8 @@ The Python bindings have a similar API to Tantivy. To create an index, first a schema
 needs to be built. After that documents can be added to the index and a reader
 can be created to search the index.

+## Building an index and populating it
+
 ```python
 import tantivy
@@ -51,29 +53,112 @@ import tantivy
 schema_builder = tantivy.SchemaBuilder()
 schema_builder.add_text_field("title", stored=True)
 schema_builder.add_text_field("body", stored=True)
+schema_builder.add_integer_field("doc_id", stored=True, indexed=True)
 schema = schema_builder.build()

-# Creating our index (in memory, but filesystem is available too)
+# Creating our index (in memory)
 index = tantivy.Index(schema)
+```
+
+To have a persistent index, use the path
+parameter to store the index on disk, e.g.:

-# Adding one document.
+```python
+import os
+index = tantivy.Index(schema, path=os.getcwd() + '/index')
+```
+
+By default, tantivy offers the following tokenizers,
+which can be used in tantivy-py:
+
+- `default`
+`default` is the tokenizer that will be used if you do not
+assign a specific tokenizer to your text field.
+It will chop your text on punctuation and whitespace,
+remove tokens that are longer than 40 chars, and lowercase your text.
+
+- `raw`
+Does not actually tokenize your text. It keeps it entirely unprocessed.
+It can be useful for indexing uuids or urls, for instance.
+
+- `en_stem`
+
+In addition to what `default` does, the `en_stem` tokenizer also
+applies stemming to your tokens. Stemming consists of trimming words to
+remove their inflection. This tokenizer is slower than the default one,
+but is recommended to improve recall.
+
+To use the above tokenizers, simply provide them as a parameter to `add_text_field`, e.g.:
+```python
+schema_builder.add_text_field("body", stored=True, tokenizer_name='en_stem')
+```

+### Adding one document.
+
+```python
 writer = index.writer()
 writer.add_document(tantivy.Document(
+    doc_id=1,
     title=["The Old Man and the Sea"],
     body=["""He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish."""],
 ))
 # ... and committing
 writer.commit()
+```


+## Building and Executing Queries
+
+First you need to get a searcher for the index.
+
+```python
 # Reload the index to ensure it points to the last commit.
 index.reload()
 searcher = index.searcher()
-query = index.parse_query("fish days", ["title", "body"])
+```
+
+Then you need to get a valid query object by parsing your query on the index.
+
+```python
+query = index.parse_query("fish days", ["title", "body"])
 (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
 best_doc = searcher.doc(best_doc_address)
 assert best_doc["title"] == ["The Old Man and the Sea"]
 print(best_doc)
 ```

+### Valid Query Formats
+
+tantivy-py supports the query language used in tantivy.
+Some basic query formats:
+
+- AND and OR conjunctions.
+```python
+query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+
+- +(includes) and -(excludes) operators.
+```python
+query = index.parse_query('+Old +Man chef -fished', ["title", "body"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+Note: in a query like the above, a word with no +/- prefix acts like an OR.
+
+- Phrase search.
+```python
+query = index.parse_query('"eighty-four days"', ["title", "body"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+
+- Integer search.
+```python
+query = index.parse_query("1", ["doc_id"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+Note: for integer search, the integer field should be indexed.
+
+For more possible query formats and query options, see the [Tantivy Query Parser Docs](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).

@@ -1,7 +1,7 @@
 import nox


-@nox.session(python=["3.7", "3.8", "3.9", "3.10"])
+@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11"])
 def test(session):
     session.install("-rrequirements-dev.txt")
     session.install("-e", ".", "--no-build-isolation")
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["maturin"]
+requires = ["maturin>=0.13,<0.14"]
 build-backend = "maturin"

 [project]
@@ -9,7 +9,7 @@ use pyo3::{
     },
 };

-use chrono::{offset::TimeZone, Datelike, Timelike, Utc};
+use chrono::{offset::TimeZone, Utc};

 use tantivy as tv;
@@ -56,37 +56,42 @@ fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
             // TODO implement me
             unimplemented!();
         }
-        Value::Date(d) => PyDateTime::new(
-            py,
-            d.year(),
-            d.month() as u8,
-            d.day() as u8,
-            d.hour() as u8,
-            d.minute() as u8,
-            d.second() as u8,
-            d.timestamp_subsec_micros(),
-            None,
-        )?
-        .into_py(py),
+        Value::Date(d) => {
+            let utc = d.into_utc();
+            PyDateTime::new(
+                py,
+                utc.year(),
+                utc.month() as u8,
+                utc.day(),
+                utc.hour(),
+                utc.minute(),
+                utc.second(),
+                utc.microsecond(),
+                None,
+            )?
+            .into_py(py)
+        }
         Value::Facet(f) => Facet { inner: f.clone() }.into_py(py),
         Value::JsonObject(json_object) => {
             let inner: HashMap<_, _> = json_object
                 .iter()
-                .map(|(k, v)| (k, value_to_object(&v, py)))
+                .map(|(k, v)| (k, value_to_object(v, py)))
                 .collect();
             inner.to_object(py)
         }
+        Value::Bool(b) => b.into_py(py),
+        Value::IpAddr(i) => (*i).to_string().into_py(py),
     })
 }

 fn value_to_string(value: &Value) -> String {
     match value {
         Value::Str(text) => text.clone(),
-        Value::U64(num) => format!("{}", num),
-        Value::I64(num) => format!("{}", num),
-        Value::F64(num) => format!("{}", num),
-        Value::Bytes(bytes) => format!("{:?}", bytes),
-        Value::Date(d) => format!("{:?}", d),
+        Value::U64(num) => format!("{num}"),
+        Value::I64(num) => format!("{num}"),
+        Value::F64(num) => format!("{num}"),
+        Value::Bytes(bytes) => format!("{bytes:?}"),
+        Value::Date(d) => format!("{d:?}"),
         Value::Facet(facet) => facet.to_string(),
         Value::PreTokStr(_pretok) => {
             // TODO implement me
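The `into_utc()` call above comes from tantivy 0.19's reworked `DateTime`, which is stored as a plain UTC timestamp and unpacked into a `time::OffsetDateTime` on demand. A minimal round-trip sketch, assuming only tantivy 0.19 (the timestamp constant is an arbitrary example):

```rust
use tantivy::DateTime;

fn main() {
    // 2023-02-01 12:30:00 UTC expressed as epoch seconds.
    let dt = DateTime::from_timestamp_secs(1_675_254_600);
    // into_utc() yields a time::OffsetDateTime pinned to UTC, which is what
    // the PyDateTime conversion above reads its fields from.
    let utc = dt.into_utc();
    assert_eq!(utc.year(), 2023);
    assert_eq!(utc.month() as u8, 2);
    assert_eq!((utc.hour(), utc.minute()), (12, 30));
}
```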
@@ -95,6 +100,8 @@ fn value_to_string(value: &Value) -> String {
         Value::JsonObject(json_object) => {
             serde_json::to_string(&json_object).unwrap()
         }
+        Value::Bool(b) => format!("{b}"),
+        Value::IpAddr(i) => format!("{}", *i),
     }
 }
@@ -141,10 +148,10 @@ impl fmt::Debug for Document {
                     .chars()
                     .take(10)
                     .collect();
-                format!("{}=[{}]", field_name, values_str)
+                format!("{field_name}=[{values_str}]")
             })
             .join(",");
-        write!(f, "Document({})", doc_str)
+        write!(f, "Document({doc_str})")
     }
 }
@@ -170,23 +177,24 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
     }
     if let Ok(py_datetime) = any.downcast::<PyDateTime>() {
         let datetime = Utc
-            .ymd(
+            .with_ymd_and_hms(
                 py_datetime.get_year(),
                 py_datetime.get_month().into(),
                 py_datetime.get_day().into(),
-            )
-            .and_hms_micro(
                 py_datetime.get_hour().into(),
                 py_datetime.get_minute().into(),
                 py_datetime.get_second().into(),
-                py_datetime.get_microsecond(),
-            );
-        return Ok(Value::Date(datetime));
+            )
+            .single()
+            .unwrap();
+        return Ok(Value::Date(tv::DateTime::from_timestamp_secs(
+            datetime.timestamp(),
+        )));
     }
     if let Ok(facet) = any.extract::<Facet>() {
-        return Ok(Value::Facet(facet.inner.clone()));
+        return Ok(Value::Facet(facet.inner));
     }
-    Err(to_pyerr(format!("Value unsupported {:?}", any)))
+    Err(to_pyerr(format!("Value unsupported {any:?}")))
 }

 fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
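The `.single().unwrap()` step exists because chrono 0.4.23 deprecated `ymd().and_hms_micro()` in favor of `with_ymd_and_hms`, which returns a `LocalResult` (a wall-clock time can be ambiguous or nonexistent in some time zones). A small sketch assuming only chrono 0.4.23; note the new constructor stops at seconds, which is why the code above funnels through epoch seconds into `tv::DateTime::from_timestamp_secs`:

```rust
use chrono::{offset::TimeZone, Utc};

fn main() {
    let datetime = Utc
        .with_ymd_and_hms(2023, 2, 1, 12, 30, 0)
        // For Utc the result is never ambiguous, so .single() is always Some.
        .single()
        .unwrap();
    assert_eq!(datetime.timestamp(), 1_675_254_600);
}
```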
@@ -200,7 +208,7 @@ fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
 #[pymethods]
 impl Document {
     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (**kwargs))]
     fn new(kwargs: Option<&PyDict>) -> PyResult<Self> {
         let mut document = Document::default();
         if let Some(field_dict) = kwargs {
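This hunk is the first of many identical pyo3 0.18 migrations in the diff: the `#[args(...)]` attribute is replaced by `#[pyo3(signature = (...))]`, with default values written as ordinary Rust expressions instead of strings. A hedged sketch of the two spellings (the `Greeter` class is illustrative, not part of tantivy-py):

```rust
use pyo3::prelude::*;

#[pyclass]
struct Greeter;

#[pymethods]
impl Greeter {
    // pyo3 0.16 spelling: #[args(name = "\"world\"", count = 1)]
    // pyo3 0.18 spelling:
    #[pyo3(signature = (name = "world", count = 1))]
    fn greet(&self, name: &str, count: usize) -> String {
        format!("hello {name}! ").repeat(count)
    }
}
```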
@@ -308,18 +316,21 @@ impl Document {
     /// value (datetime): The date that will be added to the document.
     fn add_date(&mut self, field_name: String, value: &PyDateTime) {
         let datetime = Utc
-            .ymd(
+            .with_ymd_and_hms(
                 value.get_year(),
                 value.get_month().into(),
                 value.get_day().into(),
-            )
-            .and_hms_micro(
                 value.get_hour().into(),
                 value.get_minute().into(),
                 value.get_second().into(),
-                value.get_microsecond(),
-            );
-        add_value(self, field_name, datetime);
+            )
+            .single()
+            .unwrap();
+        add_value(
+            self,
+            field_name,
+            tv::DateTime::from_timestamp_secs(datetime.timestamp()),
+        );
     }

     /// Add a facet value to the document.
@@ -396,13 +407,13 @@ impl Document {
     }

     fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
-        let gil = Python::acquire_gil();
-        let py = gil.python();
-        self.get_all(py, field_name)
+        Python::with_gil(|py| -> PyResult<Vec<PyObject>> {
+            self.get_all(py, field_name)
+        })
     }

     fn __repr__(&self) -> PyResult<String> {
-        Ok(format!("{:?}", self))
+        Ok(format!("{self:?}"))
     }
 }
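The `__getitem__` rewrite tracks pyo3 0.18's deprecation of `Python::acquire_gil` (and its `GILGuard`) in favor of the closure-scoped `Python::with_gil`. A standalone sketch, using `prepare_freethreaded_python` so it runs outside an extension module:

```rust
use pyo3::prelude::*;
use pyo3::types::PyList;

fn main() {
    pyo3::prepare_freethreaded_python();
    let len = Python::with_gil(|py| {
        // The GIL token `py` is only valid inside this closure, which makes
        // the lifetime of GIL-bound references explicit.
        let list = PyList::new(py, [1, 2, 3]);
        list.len()
    });
    assert_eq!(len, 3);
}
```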
src/index.rs (78 changes)
@@ -14,6 +14,10 @@ use tantivy as tv;
 use tantivy::{
     directory::MmapDirectory,
     schema::{NamedFieldDocument, Term, Value},
+    tokenizer::{
+        Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
+        TextAnalyzer,
+    },
 };

 const RELOAD_POLICY: &str = "commit";
@@ -120,26 +124,25 @@ impl IndexWriter {
             Value::U64(num) => Term::from_field_u64(field, num),
             Value::I64(num) => Term::from_field_i64(field, num),
             Value::F64(num) => Term::from_field_f64(field, num),
-            Value::Date(d) => Term::from_field_date(field, &d),
+            Value::Date(d) => Term::from_field_date(field, d),
             Value::Facet(facet) => Term::from_facet(field, &facet),
             Value::Bytes(_) => {
                 return Err(exceptions::PyValueError::new_err(format!(
-                    "Field `{}` is bytes type not deletable.",
-                    field_name
+                    "Field `{field_name}` is bytes type not deletable."
                 )))
             }
             Value::PreTokStr(_pretok) => {
                 return Err(exceptions::PyValueError::new_err(format!(
-                    "Field `{}` is pretokenized. This is not authorized for delete.",
-                    field_name
+                    "Field `{field_name}` is pretokenized. This is not authorized for delete."
                 )))
             }
             Value::JsonObject(_) => {
                 return Err(exceptions::PyValueError::new_err(format!(
-                    "Field `{}` is json object type not deletable.",
-                    field_name
+                    "Field `{field_name}` is json object type not deletable."
                 )))
             }
+            Value::Bool(b) => Term::from_field_bool(field, b),
+            Value::IpAddr(i) => Term::from_field_ip_addr(field, i),
         };
         Ok(self.inner_index_writer.delete_term(term))
     }
@@ -167,12 +170,15 @@ impl Index {
     #[staticmethod]
     fn open(path: &str) -> PyResult<Index> {
         let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
+
+        Index::register_custom_text_analyzers(&index);
+
         let reader = index.reader().map_err(to_pyerr)?;
         Ok(Index { index, reader })
     }

     #[new]
-    #[args(reuse = true)]
+    #[pyo3(signature = (schema, path = None, reuse = true))]
     fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Self> {
         let index = match path {
             Some(p) => {
@@ -191,6 +197,8 @@ impl Index {
             None => tv::Index::create_in_ram(schema.inner.clone()),
         };

+        Index::register_custom_text_analyzers(&index);
+
         let reader = index.reader().map_err(to_pyerr)?;
         Ok(Index { index, reader })
     }
@@ -208,7 +216,7 @@ impl Index {
     /// automatically the number of threads.
     ///
     /// Raises ValueError if there was an error while creating the writer.
-    #[args(heap_size = 3000000, num_threads = 0)]
+    #[pyo3(signature = (heap_size = 3000000, num_threads = 0))]
     fn writer(
         &self,
         heap_size: usize,
@@ -231,13 +239,13 @@ impl Index {
     /// Args:
     ///     reload_policy (str, optional): The reload policy that the
     ///         IndexReader should use. Can be `Manual` or `OnCommit`.
-    ///     num_searchers (int, optional): The number of searchers that the
+    ///     num_warmers (int, optional): The number of searchers that the
     ///         reader should create.
-    #[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
+    #[pyo3(signature = (reload_policy = RELOAD_POLICY, num_warmers = 0))]
     fn config_reader(
         &mut self,
         reload_policy: &str,
-        num_searchers: usize,
+        num_warmers: usize,
     ) -> Result<(), PyErr> {
         let reload_policy = reload_policy.to_lowercase();
         let reload_policy = match reload_policy.as_ref() {
@@ -251,8 +259,8 @@ impl Index {
         };
         let builder = self.index.reader_builder();
         let builder = builder.reload_policy(reload_policy);
-        let builder = if num_searchers > 0 {
-            builder.num_searchers(num_searchers)
+        let builder = if num_warmers > 0 {
+            builder.num_warming_threads(num_warmers)
         } else {
             builder
         };
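The rename reflects tantivy 0.19's reader builder, which pools warming threads rather than searcher instances, so `num_searchers` gave way to `num_warming_threads`. A sketch of the equivalent call in plain tantivy 0.19 (the thread count here is an arbitrary example):

```rust
use tantivy::{Index, IndexReader, ReloadPolicy};

fn build_reader(index: &Index) -> tantivy::Result<IndexReader> {
    index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        // Replaces the old num_searchers() knob.
        .num_warming_threads(2)
        .try_into()
}
```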
@@ -313,7 +321,7 @@ impl Index {
     /// default_fields_names (List[Field]): A list of fields used to search if no
     ///     field is specified in the query.
     ///
-    #[args(reload_policy = "RELOAD_POLICY")]
+    #[pyo3(signature = (query, default_field_names = None))]
     pub fn parse_query(
         &self,
         query: &str,
@@ -328,16 +336,14 @@ impl Index {
             if !field_entry.is_indexed() {
                 return Err(exceptions::PyValueError::new_err(
                     format!(
-                        "Field `{}` is not set as indexed in the schema.",
-                        default_field_name
+                        "Field `{default_field_name}` is not set as indexed in the schema."
                     ),
                 ));
             }
             default_fields.push(field);
         } else {
             return Err(exceptions::PyValueError::new_err(format!(
-                "Field `{}` is not defined in the schema.",
-                default_field_name
+                "Field `{default_field_name}` is not defined in the schema."
             )));
         }
     }
@@ -355,3 +361,35 @@ impl Index {
         Ok(Query { inner: query })
     }
 }
+
+impl Index {
+    fn register_custom_text_analyzers(index: &tv::Index) {
+        let analyzers = [
+            ("ar_stem", Language::Arabic),
+            ("da_stem", Language::Danish),
+            ("nl_stem", Language::Dutch),
+            ("fi_stem", Language::Finnish),
+            ("fr_stem", Language::French),
+            ("de_stem", Language::German),
+            ("el_stem", Language::Greek),
+            ("hu_stem", Language::Hungarian),
+            ("it_stem", Language::Italian),
+            ("no_stem", Language::Norwegian),
+            ("pt_stem", Language::Portuguese),
+            ("ro_stem", Language::Romanian),
+            ("ru_stem", Language::Russian),
+            ("es_stem", Language::Spanish),
+            ("sv_stem", Language::Swedish),
+            ("ta_stem", Language::Tamil),
+            ("tr_stem", Language::Turkish),
+        ];
+
+        for (name, lang) in &analyzers {
+            let an = TextAnalyzer::from(SimpleTokenizer)
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser)
+                .filter(Stemmer::new(*lang));
+            index.tokenizers().register(name, an);
+        }
+    }
+}
@@ -1,5 +1,5 @@
-use ::tantivy as tv;
 use pyo3::{exceptions, prelude::*};
+use tantivy as tv;

 mod document;
 mod facet;
@@ -88,8 +88,7 @@ pub(crate) fn get_field(
 ) -> PyResult<tv::schema::Field> {
     let field = schema.get_field(field_name).ok_or_else(|| {
         exceptions::PyValueError::new_err(format!(
-            "Field `{}` is not defined in the schema.",
-            field_name
+            "Field `{field_name}` is not defined in the schema."
         ))
     })?;
@@ -6,7 +6,7 @@ use tantivy::schema;

 use crate::schema::Schema;
 use std::sync::{Arc, RwLock};
-use tantivy::schema::INDEXED;
+use tantivy::schema::{DateOptions, INDEXED};

 /// Tantivy has a very strict schema.
 /// You need to specify in advance whether a field is indexed or not,
@@ -60,11 +60,12 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(
+    #[pyo3(signature = (
         name,
         stored = false,
-        tokenizer_name = "TOKENIZER",
-        index_option = "RECORD"
-    )]
+        tokenizer_name = TOKENIZER,
+        index_option = RECORD
+    ))]
     fn add_text_field(
         &mut self,
         name: &str,
@@ -109,7 +110,7 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(stored = false, indexed = false)]
+    #[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
     fn add_integer_field(
         &mut self,
         name: &str,
@@ -173,7 +174,7 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(stored = false, indexed = false)]
+    #[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
     fn add_unsigned_field(
         &mut self,
         name: &str,
@@ -207,7 +208,7 @@ impl SchemaBuilder {
     ///     field. Fast fields are designed for random access. Access time
     ///     are similar to a random lookup in an array. If more than one
     ///     value is associated to a fast field, only the last one is kept.
-    ///     Can be one of 'single' or 'multi'. If this is set to 'single,
+    ///     Can be one of 'single' or 'multi'. If this is set to 'single',
     ///     the document must have exactly one value associated to the
     ///     document. If this is set to 'multi', the document can have any
     ///     number of values associated to the document. Defaults to None,
@@ -215,7 +216,7 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(stored = false, indexed = false)]
+    #[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
     fn add_date_field(
         &mut self,
         name: &str,
@@ -225,7 +226,29 @@ impl SchemaBuilder {
     ) -> PyResult<Self> {
         let builder = &mut self.builder;

-        let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;
+        let mut opts = DateOptions::default();
+        if stored {
+            opts = opts.set_stored();
+        }
+        if indexed {
+            opts = opts.set_indexed();
+        }
+        let fast = match fast {
+            Some(f) => {
+                let f = f.to_lowercase();
+                match f.as_ref() {
+                    "single" => Some(schema::Cardinality::SingleValue),
+                    "multi" => Some(schema::Cardinality::MultiValues),
+                    _ => return Err(exceptions::PyValueError::new_err(
+                        "Invalid index option, valid choices are: 'multi' and 'single'"
+                    )),
+                }
+            }
+            None => None,
+        };
+        if let Some(f) = fast {
+            opts = opts.set_fast(f);
+        }

         if let Some(builder) = builder.write().unwrap().as_mut() {
             builder.add_date_field(name, opts);
@@ -256,11 +279,12 @@ impl SchemaBuilder {
     ///
     /// Returns the associated field handle.
     /// Raises a ValueError if there was an error with the field creation.
-    #[args(
+    #[pyo3(signature = (
         name,
         stored = false,
-        tokenizer_name = "TOKENIZER",
-        index_option = "RECORD"
-    )]
+        tokenizer_name = TOKENIZER,
+        index_option = RECORD
+    ))]
     fn add_json_field(
         &mut self,
         name: &str,
@@ -10,7 +10,7 @@ use tantivy::collector::{Count, MultiCollector, TopDocs};
 /// A Searcher is used to search the index given a prepared Query.
 #[pyclass]
 pub(crate) struct Searcher {
-    pub(crate) inner: tv::LeasedItem<tv::Searcher>,
+    pub(crate) inner: tv::Searcher,
 }

 #[derive(Clone)]
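The field type changes because tantivy 0.19's `reader.searcher()` hands out an owned `Searcher` clone instead of 0.17's `LeasedItem<Searcher>` wrapper. A minimal sketch against plain tantivy 0.19:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher(); // owned tv::Searcher, no LeasedItem
    assert_eq!(searcher.num_docs(), 1);
    Ok(())
}
```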
@@ -22,8 +22,8 @@ enum Fruit {
 impl std::fmt::Debug for Fruit {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            Fruit::Score(s) => f.write_str(&format!("{}", s)),
-            Fruit::Order(o) => f.write_str(&format!("{}", o)),
+            Fruit::Score(s) => f.write_str(&format!("{s}")),
+            Fruit::Order(o) => f.write_str(&format!("{o}")),
         }
     }
 }
@@ -93,7 +93,7 @@ impl Searcher {
     /// Returns `SearchResult` object.
     ///
     /// Raises a ValueError if there was an error with the search.
-    #[args(limit = 10, offset = 0, count = true)]
+    #[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0))]
     fn search(
         &self,
         _py: Python,
@@ -154,10 +154,7 @@ impl Searcher {
             }
         };

-        let count = match count_handle {
-            Some(h) => Some(h.extract(&mut multifruit)),
-            None => None,
-        };
+        let count = count_handle.map(|h| h.extract(&mut multifruit));

         Ok(SearchResult { hits, count })
     }
@@ -230,11 +227,11 @@ impl From<&tv::DocAddress> for DocAddress {
     }
 }

-impl Into<tv::DocAddress> for &DocAddress {
-    fn into(self) -> tv::DocAddress {
+impl From<&DocAddress> for tv::DocAddress {
+    fn from(val: &DocAddress) -> Self {
         tv::DocAddress {
-            segment_ord: self.segment_ord(),
-            doc_id: self.doc(),
+            segment_ord: val.segment_ord(),
+            doc_id: val.doc(),
         }
     }
 }
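Flipping `impl Into` to `impl From` is the standard fix for clippy's `from_over_into` lint: the standard library's blanket `impl<T, U: From<T>> Into<U> for T` means every call site keeps using `.into()` for free. A generic sketch of the pattern (the `Meters`/`Feet` types are illustrative only):

```rust
struct Meters(f64);
struct Feet(f64);

// Implement From (not Into); the blanket impl provides Into automatically.
impl From<&Meters> for Feet {
    fn from(m: &Meters) -> Self {
        Feet(m.0 * 3.28084)
    }
}

fn main() {
    let feet: Feet = (&Meters(10.0)).into();
    assert!((feet.0 - 32.8084).abs() < 1e-9);
}
```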
@@ -0,0 +1 @@
+from .tantivy import *
@@ -115,6 +115,56 @@ def create_index_with_numeric_fields(dir=None):
     index.reload()
     return index


+def spanish_schema():
+    return (
+        SchemaBuilder()
+        .add_text_field("title", stored=True, tokenizer_name='es_stem')
+        .add_text_field("body", tokenizer_name='es_stem')
+        .build()
+    )
+
+
+def create_spanish_index():
+    # assume all tests will use the same documents for now
+    # other methods may set up function-local indexes
+    index = Index(spanish_schema(), None)
+    writer = index.writer()
+
+    # 2 ways of adding documents
+    # 1
+    doc = Document()
+    # create a document instance
+    # add field-value pairs
+    doc.add_text("title", "El viejo y el mar")
+    doc.add_text(
+        "body",
+        (
+            "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
+        ),
+    )
+    writer.add_document(doc)
+    # 2 use the built-in json support
+    # keys need to coincide with field names
+    doc = Document.from_dict(
+        {
+            "title": "De ratones y hombres",
+            "body": (
+                "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en las junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas que se arquean sobre el estanque"
+            ),
+        }
+    )
+    writer.add_document(doc)
+    writer.add_json(
+        """{
+            "title": ["Frankenstein", "El moderno Prometeo"],
+            "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
+        }"""
+    )
+    writer.commit()
+    index.reload()
+    return index
+
+
 @pytest.fixture()
 def dir_index(tmpdir):
     return (tmpdir, create_index(str(tmpdir)))
@@ -130,6 +180,10 @@ def ram_index_numeric_fields():
     return create_index_with_numeric_fields()


+@pytest.fixture()
+def spanish_index():
+    return create_spanish_index()
+
+
 class TestClass(object):
     def test_simple_search_in_dir(self, dir_index):
         _, index = dir_index
@@ -156,6 +210,16 @@ class TestClass(object):
         searched_doc = index.searcher().doc(doc_address)
         assert searched_doc["title"] == ["The Old Man and the Sea"]

+    def test_simple_search_in_spanish(self, spanish_index):
+        index = spanish_index
+        query = index.parse_query("vieja", ["title", "body"])
+
+        result = index.searcher().search(query, 10)
+        assert len(result.hits) == 1
+        _, doc_address = result.hits[0]
+        search_doc = index.searcher().doc(doc_address)
+        assert search_doc["title"] == ["El viejo y el mar"]
+
     def test_and_query(self, ram_index):
         index = ram_index
         query = index.parse_query(