Tantivy 0.19.2 (#67)

* Adding __init__.py file to the tantivy folder to make maturin happy

Add Cargo.lock to the repo

Set the git-fetch-with-cli cargo flag so that we can override fetch settings

Renaming .cargo/config to .cargo/config.toml

Adding github-quiq-sh cargo registry

Point dependencies at our github-quiq-sh registry

Trying to resolve this build issue, pointing pyo3-build-config at our github-quiq-sh registry

SER-21487: Enable support for all standard Tantivy languages plus Chinese + Japanese in tantivy-py

SER-21487: Use uname rather than UNAME in the Makefile

SER-21487: Fix document date handling

SER-23013: Upgrade Tantivy and other dependencies

* Upgrade to Tantivy 0.19.1

* Apply rustfmt and fix bug when fast option = None

* Upgrade to tantivy-0.19.2

* Standardize around using 'cargo fmt' rather than 'rustfmt'

* Reverting to old style dependencies

* Linting with clippy

* Switching out hashmap for defining tokenizers for an array, and adding test for Spanish indexing

* Use cargo fmt instead of rustfmt on the Lint ci step
master
Phill Mell-Davies 2023-02-14 08:20:59 -05:00 committed by GitHub
parent b2043793ee
commit 164adc87e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 1820 additions and 106 deletions

View File

@ -2,4 +2,4 @@
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]
]

View File

@ -18,7 +18,7 @@ jobs:
toolchain: stable
components: rustfmt
- name: Check Formatting
run: rustfmt --check src/*rs
run: cargo fmt --check
Test:
strategy:

2
.gitignore vendored
View File

@ -3,9 +3,9 @@
build
/target
**/*.rs.bk
Cargo.lock
dist/
__pycache__/
tantivy.so
tantivy.dylib
tantivy/tantivy.cpython*.so
tantivy.egg-info/

1573
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.17.0"
version = "0.19.2"
readme = "README.md"
authors = ["Damir Jelić <poljar@termina.org.uk>"]
edition = "2018"
@ -11,15 +11,15 @@ name = "tantivy"
crate-type = ["cdylib"]
[build-dependencies]
pyo3-build-config = "0.16.3"
pyo3-build-config = "0.18.0"
[dependencies]
chrono = "0.4.19"
tantivy = "0.17"
itertools = "0.10.3"
futures = "0.3.21"
serde_json = "1.0.64"
chrono = "0.4.23"
tantivy = "0.19.2"
itertools = "0.10.5"
futures = "0.3.26"
serde_json = "1.0.91"
[dependencies.pyo3]
version = "0.16.3"
features = ["extension-module"]
version = "0.18.0"
features = ["extension-module"]

View File

@ -1,4 +1,4 @@
ifeq ($(shell UNAME),Darwin)
ifeq ($(shell uname),Darwin)
EXT := dylib
else
EXT := so
@ -6,15 +6,21 @@ endif
source_files := $(wildcard src/*.rs)
all: tantivy/tantivy.$(EXT)
all: format lint build test
PHONY: test format
lint:
cargo clippy
test: tantivy/tantivy.$(EXT)
python3 -m pytest
format:
rustfmt src/*.rs
cargo fmt
build:
maturin build --interpreter python3.7 python3.8 python3.9 python3.10 python3.11
tantivy/tantivy.$(EXT): target/debug/libtantivy.$(EXT)
cp target/debug/libtantivy.$(EXT) tantivy/tantivy.so

View File

@ -1,5 +1,5 @@
[build-system]
requires = ["maturin"]
requires = ["maturin>=0.13,<0.14"]
build-backend = "maturin"
[project]

View File

@ -1 +1 @@
max_width = 80
max_width = 80

View File

@ -9,7 +9,7 @@ use pyo3::{
},
};
use chrono::{offset::TimeZone, Datelike, Timelike, Utc};
use chrono::{offset::TimeZone, Utc};
use tantivy as tv;
@ -56,37 +56,42 @@ fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
// TODO implement me
unimplemented!();
}
Value::Date(d) => PyDateTime::new(
py,
d.year(),
d.month() as u8,
d.day() as u8,
d.hour() as u8,
d.minute() as u8,
d.second() as u8,
d.timestamp_subsec_micros(),
None,
)?
.into_py(py),
Value::Date(d) => {
let utc = d.into_utc();
PyDateTime::new(
py,
utc.year(),
utc.month() as u8,
utc.day(),
utc.hour(),
utc.minute(),
utc.second(),
utc.microsecond(),
None,
)?
.into_py(py)
}
Value::Facet(f) => Facet { inner: f.clone() }.into_py(py),
Value::JsonObject(json_object) => {
let inner: HashMap<_, _> = json_object
.iter()
.map(|(k, v)| (k, value_to_object(&v, py)))
.map(|(k, v)| (k, value_to_object(v, py)))
.collect();
inner.to_object(py)
}
Value::Bool(b) => b.into_py(py),
Value::IpAddr(i) => (*i).to_string().into_py(py),
})
}
fn value_to_string(value: &Value) -> String {
match value {
Value::Str(text) => text.clone(),
Value::U64(num) => format!("{}", num),
Value::I64(num) => format!("{}", num),
Value::F64(num) => format!("{}", num),
Value::Bytes(bytes) => format!("{:?}", bytes),
Value::Date(d) => format!("{:?}", d),
Value::U64(num) => format!("{num}"),
Value::I64(num) => format!("{num}"),
Value::F64(num) => format!("{num}"),
Value::Bytes(bytes) => format!("{bytes:?}"),
Value::Date(d) => format!("{d:?}"),
Value::Facet(facet) => facet.to_string(),
Value::PreTokStr(_pretok) => {
// TODO implement me
@ -95,6 +100,8 @@ fn value_to_string(value: &Value) -> String {
Value::JsonObject(json_object) => {
serde_json::to_string(&json_object).unwrap()
}
Value::Bool(b) => format!("{b}"),
Value::IpAddr(i) => format!("{}", *i),
}
}
@ -141,10 +148,10 @@ impl fmt::Debug for Document {
.chars()
.take(10)
.collect();
format!("{}=[{}]", field_name, values_str)
format!("{field_name}=[{values_str}]")
})
.join(",");
write!(f, "Document({})", doc_str)
write!(f, "Document({doc_str})")
}
}
@ -170,23 +177,24 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
}
if let Ok(py_datetime) = any.downcast::<PyDateTime>() {
let datetime = Utc
.ymd(
.with_ymd_and_hms(
py_datetime.get_year(),
py_datetime.get_month().into(),
py_datetime.get_day().into(),
)
.and_hms_micro(
py_datetime.get_hour().into(),
py_datetime.get_minute().into(),
py_datetime.get_second().into(),
py_datetime.get_microsecond(),
);
return Ok(Value::Date(datetime));
)
.single()
.unwrap();
return Ok(Value::Date(tv::DateTime::from_timestamp_secs(
datetime.timestamp(),
)));
}
if let Ok(facet) = any.extract::<Facet>() {
return Ok(Value::Facet(facet.inner.clone()));
return Ok(Value::Facet(facet.inner));
}
Err(to_pyerr(format!("Value unsupported {:?}", any)))
Err(to_pyerr(format!("Value unsupported {any:?}")))
}
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
@ -200,7 +208,7 @@ fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
#[pymethods]
impl Document {
#[new]
#[args(kwargs = "**")]
#[pyo3(signature = (**kwargs))]
fn new(kwargs: Option<&PyDict>) -> PyResult<Self> {
let mut document = Document::default();
if let Some(field_dict) = kwargs {
@ -299,18 +307,21 @@ impl Document {
/// value (datetime): The date that will be added to the document.
fn add_date(&mut self, field_name: String, value: &PyDateTime) {
let datetime = Utc
.ymd(
.with_ymd_and_hms(
value.get_year(),
value.get_month().into(),
value.get_day().into(),
)
.and_hms_micro(
value.get_hour().into(),
value.get_minute().into(),
value.get_second().into(),
value.get_microsecond(),
);
add_value(self, field_name, datetime);
)
.single()
.unwrap();
add_value(
self,
field_name,
tv::DateTime::from_timestamp_secs(datetime.timestamp()),
);
}
/// Add a facet value to the document.
@ -387,13 +398,13 @@ impl Document {
}
fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
let gil = Python::acquire_gil();
let py = gil.python();
self.get_all(py, field_name)
Python::with_gil(|py| -> PyResult<Vec<PyObject>> {
self.get_all(py, field_name)
})
}
fn __repr__(&self) -> PyResult<String> {
Ok(format!("{:?}", self))
Ok(format!("{self:?}"))
}
}

View File

@ -14,6 +14,10 @@ use tantivy as tv;
use tantivy::{
directory::MmapDirectory,
schema::{NamedFieldDocument, Term, Value},
tokenizer::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
TextAnalyzer,
},
};
const RELOAD_POLICY: &str = "commit";
@ -120,26 +124,25 @@ impl IndexWriter {
Value::U64(num) => Term::from_field_u64(field, num),
Value::I64(num) => Term::from_field_i64(field, num),
Value::F64(num) => Term::from_field_f64(field, num),
Value::Date(d) => Term::from_field_date(field, &d),
Value::Date(d) => Term::from_field_date(field, d),
Value::Facet(facet) => Term::from_facet(field, &facet),
Value::Bytes(_) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is bytes type not deletable.",
field_name
"Field `{field_name}` is bytes type not deletable."
)))
}
Value::PreTokStr(_pretok) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is pretokenized. This is not authorized for delete.",
field_name
"Field `{field_name}` is pretokenized. This is not authorized for delete."
)))
}
Value::JsonObject(_) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is json object type not deletable.",
field_name
"Field `{field_name}` is json object type not deletable."
)))
}
},
Value::Bool(b) => Term::from_field_bool(field, b),
Value::IpAddr(i) => Term::from_field_ip_addr(field, i)
};
Ok(self.inner_index_writer.delete_term(term))
}
@ -167,12 +170,15 @@ impl Index {
#[staticmethod]
fn open(path: &str) -> PyResult<Index> {
let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
Index::register_custom_text_analyzers(&index);
let reader = index.reader().map_err(to_pyerr)?;
Ok(Index { index, reader })
}
#[new]
#[args(reuse = true)]
#[pyo3(signature = (schema, path = None, reuse = true))]
fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Self> {
let index = match path {
Some(p) => {
@ -191,6 +197,8 @@ impl Index {
None => tv::Index::create_in_ram(schema.inner.clone()),
};
Index::register_custom_text_analyzers(&index);
let reader = index.reader().map_err(to_pyerr)?;
Ok(Index { index, reader })
}
@ -208,7 +216,7 @@ impl Index {
/// automatically the number of threads.
///
/// Raises ValueError if there was an error while creating the writer.
#[args(heap_size = 3000000, num_threads = 0)]
#[pyo3(signature = (heap_size = 3000000, num_threads = 0))]
fn writer(
&self,
heap_size: usize,
@ -231,13 +239,13 @@ impl Index {
/// Args:
/// reload_policy (str, optional): The reload policy that the
/// IndexReader should use. Can be `Manual` or `OnCommit`.
/// num_searchers (int, optional): The number of searchers that the
/// num_warmers (int, optional): The number of warming threads that the
/// reader should create.
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
#[pyo3(signature = (reload_policy = RELOAD_POLICY, num_warmers = 0))]
fn config_reader(
&mut self,
reload_policy: &str,
num_searchers: usize,
num_warmers: usize,
) -> Result<(), PyErr> {
let reload_policy = reload_policy.to_lowercase();
let reload_policy = match reload_policy.as_ref() {
@ -251,8 +259,8 @@ impl Index {
};
let builder = self.index.reader_builder();
let builder = builder.reload_policy(reload_policy);
let builder = if num_searchers > 0 {
builder.num_searchers(num_searchers)
let builder = if num_warmers > 0 {
builder.num_warming_threads(num_warmers)
} else {
builder
};
@ -313,7 +321,7 @@ impl Index {
/// default_field_names (List[Field]): A list of fields used to search if no
/// field is specified in the query.
///
#[args(reload_policy = "RELOAD_POLICY")]
#[pyo3(signature = (query, default_field_names = None))]
pub fn parse_query(
&self,
query: &str,
@ -328,16 +336,14 @@ impl Index {
if !field_entry.is_indexed() {
return Err(exceptions::PyValueError::new_err(
format!(
"Field `{}` is not set as indexed in the schema.",
default_field_name
"Field `{default_field_name}` is not set as indexed in the schema."
),
));
}
default_fields.push(field);
} else {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{}` is not defined in the schema.",
default_field_name
"Field `{default_field_name}` is not defined in the schema."
)));
}
}
@ -355,3 +361,35 @@ impl Index {
Ok(Query { inner: query })
}
}
impl Index {
fn register_custom_text_analyzers(index: &tv::Index) {
let analyzers = [
("ar_stem", Language::Arabic),
("da_stem", Language::Danish),
("nl_stem", Language::Dutch),
("fi_stem", Language::Finnish),
("fr_stem", Language::French),
("de_stem", Language::German),
("el_stem", Language::Greek),
("hu_stem", Language::Hungarian),
("it_stem", Language::Italian),
("no_stem", Language::Norwegian),
("pt_stem", Language::Portuguese),
("ro_stem", Language::Romanian),
("ru_stem", Language::Russian),
("es_stem", Language::Spanish),
("sv_stem", Language::Swedish),
("ta_stem", Language::Tamil),
("tr_stem", Language::Turkish),
];
for (name, lang) in &analyzers {
let an = TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(*lang));
index.tokenizers().register(name, an);
}
}
}

View File

@ -1,5 +1,5 @@
use ::tantivy as tv;
use pyo3::{exceptions, prelude::*};
use tantivy as tv;
mod document;
mod facet;
@ -88,8 +88,7 @@ pub(crate) fn get_field(
) -> PyResult<tv::schema::Field> {
let field = schema.get_field(field_name).ok_or_else(|| {
exceptions::PyValueError::new_err(format!(
"Field `{}` is not defined in the schema.",
field_name
"Field `{field_name}` is not defined in the schema."
))
})?;

View File

@ -6,7 +6,7 @@ use tantivy::schema;
use crate::schema::Schema;
use std::sync::{Arc, RwLock};
use tantivy::schema::INDEXED;
use tantivy::schema::{DateOptions, INDEXED};
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
@ -60,11 +60,12 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(
#[pyo3(signature = (
name,
stored = false,
tokenizer_name = "TOKENIZER",
index_option = "RECORD"
)]
tokenizer_name = TOKENIZER,
index_option = RECORD
))]
fn add_text_field(
&mut self,
name: &str,
@ -109,7 +110,7 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
fn add_integer_field(
&mut self,
name: &str,
@ -151,7 +152,7 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
fn add_unsigned_field(
&mut self,
name: &str,
@ -185,7 +186,7 @@ impl SchemaBuilder {
/// field. Fast fields are designed for random access. Access time
/// are similar to a random lookup in an array. If more than one
/// value is associated to a fast field, only the last one is kept.
/// Can be one of 'single' or 'multi'. If this is set to 'single,
/// Can be one of 'single' or 'multi'. If this is set to 'single',
/// the document must have exactly one value associated to the
/// document. If this is set to 'multi', the document can have any
/// number of values associated to the document. Defaults to None,
@ -193,7 +194,7 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
#[pyo3(signature = (name, stored = false, indexed = false, fast = None))]
fn add_date_field(
&mut self,
name: &str,
@ -203,7 +204,29 @@ impl SchemaBuilder {
) -> PyResult<Self> {
let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
let mut opts = DateOptions::default();
if stored {
opts = opts.set_stored();
}
if indexed {
opts = opts.set_indexed();
}
let fast = match fast {
Some(f) => {
let f = f.to_lowercase();
match f.as_ref() {
"single" => Some(schema::Cardinality::SingleValue),
"multi" => Some(schema::Cardinality::MultiValues),
_ => return Err(exceptions::PyValueError::new_err(
"Invalid index option, valid choices are: 'multi' and 'single'"
)),
}
}
None => None,
};
if let Some(f) = fast {
opts = opts.set_fast(f);
}
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_date_field(name, opts);
@ -234,11 +257,12 @@ impl SchemaBuilder {
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(
#[pyo3(signature = (
name,
stored = false,
tokenizer_name = "TOKENIZER",
index_option = "RECORD"
)]
tokenizer_name = TOKENIZER,
index_option = RECORD
))]
fn add_json_field(
&mut self,
name: &str,

View File

@ -10,7 +10,7 @@ use tantivy::collector::{Count, MultiCollector, TopDocs};
/// A Searcher is used to search the index given a prepared Query.
#[pyclass]
pub(crate) struct Searcher {
pub(crate) inner: tv::LeasedItem<tv::Searcher>,
pub(crate) inner: tv::Searcher,
}
#[derive(Clone)]
@ -22,8 +22,8 @@ enum Fruit {
impl std::fmt::Debug for Fruit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Fruit::Score(s) => f.write_str(&format!("{}", s)),
Fruit::Order(o) => f.write_str(&format!("{}", o)),
Fruit::Score(s) => f.write_str(&format!("{s}")),
Fruit::Order(o) => f.write_str(&format!("{o}")),
}
}
}
@ -93,7 +93,7 @@ impl Searcher {
/// Returns `SearchResult` object.
///
/// Raises a ValueError if there was an error with the search.
#[args(limit = 10, offset = 0, count = true)]
#[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0))]
fn search(
&self,
_py: Python,
@ -154,10 +154,7 @@ impl Searcher {
}
};
let count = match count_handle {
Some(h) => Some(h.extract(&mut multifruit)),
None => None,
};
let count = count_handle.map(|h| h.extract(&mut multifruit));
Ok(SearchResult { hits, count })
}
@ -230,11 +227,11 @@ impl From<&tv::DocAddress> for DocAddress {
}
}
impl Into<tv::DocAddress> for &DocAddress {
fn into(self) -> tv::DocAddress {
impl From<&DocAddress> for tv::DocAddress {
fn from(val: &DocAddress) -> Self {
tv::DocAddress {
segment_ord: self.segment_ord(),
doc_id: self.doc(),
segment_ord: val.segment_ord(),
doc_id: val.doc(),
}
}
}

1
tantivy/__init__.py Normal file
View File

@ -0,0 +1 @@
from .tantivy import *

View File

@ -67,6 +67,56 @@ def create_index(dir=None):
return index
def spanish_schema():
return (
SchemaBuilder()
.add_text_field("title", stored=True, tokenizer_name='es_stem')
.add_text_field("body", tokenizer_name='es_stem')
.build()
)
def create_spanish_index():
# assume all tests will use the same documents for now
# other methods may set up function-local indexes
index = Index(spanish_schema(), None)
writer = index.writer()
# 2 ways of adding documents
# 1
doc = Document()
# create a document instance
# add field-value pairs
doc.add_text("title", "El viejo y el mar")
doc.add_text(
"body",
(
"Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
),
)
writer.add_document(doc)
# 2 use the built-in json support
# keys need to coincide with field names
doc = Document.from_dict(
{
"title": "De ratones y hombres",
"body": (
"Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en las junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas que se arquean sobre el estanque"
),
}
)
writer.add_document(doc)
writer.add_json(
"""{
"title": ["Frankenstein", "El moderno Prometeo"],
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
}"""
)
writer.commit()
index.reload()
return index
@pytest.fixture()
def dir_index(tmpdir):
return (tmpdir, create_index(str(tmpdir)))
@ -77,6 +127,11 @@ def ram_index():
return create_index()
@pytest.fixture(scope="class")
def spanish_index():
return create_spanish_index()
class TestClass(object):
def test_simple_search_in_dir(self, dir_index):
_, index = dir_index
@ -103,6 +158,16 @@ class TestClass(object):
searched_doc = index.searcher().doc(doc_address)
assert searched_doc["title"] == ["The Old Man and the Sea"]
def test_simple_search_in_spanish(self, spanish_index):
index = spanish_index
query = index.parse_query("vieja", ["title", "body"])
result = index.searcher().search(query, 10)
assert len(result.hits) == 1
_, doc_address = result.hits[0]
search_doc = index.searcher().doc(doc_address)
assert search_doc["title"] == ["El viejo y el mar"]
def test_and_query(self, ram_index):
index = ram_index
query = index.parse_query(