From b377f570efc4268bb0d4998d55df836825cf4f51 Mon Sep 17 00:00:00 2001 From: Chris Tam Date: Fri, 21 Jul 2023 18:13:03 -0400 Subject: [PATCH] Add schema validation to PyDict -> Document (#88) * Add schema validation to PyDict -> Document * Address comments * Add documentation about new functionality --- Cargo.lock | 336 ++++++++++++++++++++++++++++++++++-------- Cargo.toml | 2 +- src/document.rs | 222 ++++++++++++++++++++++------ tests/tantivy_test.py | 90 +++++++++-- 4 files changed, 532 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64341ea..9ea27cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,20 +4,20 @@ version = 3 [[package]] name = "ahash" -version = "0.7.6" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ - "getrandom", + "cfg-if", "once_cell", "version_check", ] [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] @@ -56,9 +56,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" -version = "0.13.1" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" [[package]] name = "bitflags" @@ -66,6 +66,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.3.3" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" + [[package]] name = "bitpacking" version = "0.8.4" @@ -92,6 +98,9 @@ name = "cc" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +dependencies = [ + "jobserver", +] [[package]] name = "census" @@ -259,6 +268,27 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "fail" version = "0.5.1" @@ -276,20 +306,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25c7df09945d65ea8d70b3321547ed414bbc540aad5bac6883d021b970f35b04" -[[package]] -name = "fastfield_codecs" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374a3a53c1bd5fb31b10084229290eafb0a05f260ec90f1f726afffda4877a8a" -dependencies = [ - "fastdivide", - "itertools", - "log", - "ownedbytes", - "tantivy-bitpacker", - "tantivy-common", -] - [[package]] name = "fastrand" version = "1.8.0" @@ -300,13 +316,19 @@ dependencies = [ ] [[package]] -name = "fs2" -version = "0.4.3" +name = "fnv" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "fs4" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" dependencies = [ - "libc", - "winapi", + "rustix", + "windows-sys 0.48.0", ] [[package]] @@ -424,9 +446,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.12.3" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" dependencies = [ "ahash", ] @@ -503,6 +525,15 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" +[[package]] +name = "jobserver" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.61" @@ -526,9 +557,9 @@ checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "link-cplusplus" @@ -539,6 +570,12 @@ dependencies = [ "cc", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" + [[package]] name = "lock_api" version = "0.4.9" @@ -574,18 +611,18 @@ 
dependencies = [ [[package]] name = "lru" -version = "0.7.8" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999beba7b6e8345721bd280141ed958096a2e4abdf74f67ff4ce49b4b54e47a" +checksum = "718e8fae447df0c7e1ba7f5189829e63fd536945c8988d61444c19039f16b670" dependencies = [ "hashbrown", ] [[package]] name = "lz4_flex" -version = "0.9.5" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a8cbbb2831780bc3b9c15a41f5b49222ef756b6730a95f3decfdd15903eb5a3" +checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83" [[package]] name = "matchers" @@ -614,9 +651,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.5.8" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc" +checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872" dependencies = [ "libc", ] @@ -641,12 +678,9 @@ dependencies = [ [[package]] name = "murmurhash32" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d736ff882f0e85fe9689fb23db229616c4c00aee2b3ac282f666d8f20eb25d4a" -dependencies = [ - "byteorder", -] +checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df" [[package]] name = "nu-ansi-term" @@ -710,9 +744,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e957eaa64a299f39755416e5b3128c505e9d63a91d0453771ad2ccd3907f8db" +checksum = "c718e498b20704d5fb5d51d07f414a22f61c19254c1708e117b93fd76860739c" dependencies = [ "stable_deref_trait", ] @@ -737,7 +771,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-sys", 
+ "windows-sys 0.45.0", ] [[package]] @@ -752,6 +786,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -774,6 +814,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06a3d8e8a46ab2738109347433cb7b96dffda2e4a218b03ef27090238886b147" dependencies = [ "cfg-if", + "chrono", "indoc", "libc", "memoffset 0.8.0", @@ -894,7 +935,7 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -946,6 +987,19 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustix" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" +dependencies = [ + "bitflags 2.3.3", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.48.0", +] + [[package]] name = "rustversion" version = "1.0.11" @@ -1016,6 +1070,15 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "sketches-ddsketch" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.7" @@ -1050,7 +1113,7 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.19.2" +version = "0.20.1" dependencies = [ "chrono", "futures", @@ -1058,14 +1121,14 
@@ dependencies = [ "pyo3", "pyo3-build-config", "serde_json", - "tantivy 0.19.2 (registry+https://github.com/rust-lang/crates.io-index)", + "tantivy 0.20.2", ] [[package]] name = "tantivy" -version = "0.19.2" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb26a6b22c84d8be41d99a14016d6f04d30d8d31a2ea411a8ab553af5cc490d" +checksum = "aec540e9cebc88f523f67f596dee213e491f0c55961de013566f267a0c31f5e9" dependencies = [ "aho-corasick", "arc-swap", @@ -1079,8 +1142,7 @@ dependencies = [ "downcast-rs", "fail", "fastdivide", - "fastfield_codecs", - "fs2", + "fs4", "htmlescape", "itertools", "levenshtein_automata", @@ -1093,19 +1155,21 @@ dependencies = [ "num_cpus", "once_cell", "oneshot", - "ownedbytes", "rayon", "regex", "rust-stemmers", "rustc-hash", "serde", "serde_json", + "sketches-ddsketch", "smallvec", - "stable_deref_trait", "tantivy-bitpacker", + "tantivy-columnar", "tantivy-common", "tantivy-fst", "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", "tempfile", "thiserror", "time 0.3.17", @@ -1115,18 +1179,40 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e71a0c95b82d4292b097a09b989a6380d28c3a86800c841a2d03bae1fc8b9fa6" +checksum = "16099e96f0ede682084469b80d6909dc170aa2b11d2a45538b5b36b2a90090b9" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e32b024b26eab93eb8648faf08004356bf9d47376557ee4409f4b210163656" +dependencies = [ + "fastdivide", + "fnv", + "itertools", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] [[package]] name = "tantivy-common" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"14fef4182bb60df9a4b92cd8ecab39ba2e50a05542934af17eef1f49660705cb" +checksum = "e7d12fdd6ec0f7e0962f129c03c696a85ec567734950cbb2b89af4a293ce342f" dependencies = [ + "async-trait", "byteorder", "ownedbytes", + "serde", + "time 0.3.17", ] [[package]] @@ -1142,15 +1228,45 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343e3ada4c1c480953f6960f8a21ce9c76611480ffdd4f4e230fdddce0fc5331" +checksum = "106d8f78ad1da4f0fdd526a0760c326c0573510d4dedabeb1962d35a35879797" dependencies = [ "combine", "once_cell", "regex", ] +[[package]] +name = "tantivy-sstable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda34243d3ee64bd8f9ba74a3b0d05f4d07beff7767a727212e9b5a19c13dde7" +dependencies = [ + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b9e9470301b026ad3b95f79a791a2a3ee81f3ab16fbe412a9dd81ff834acf5" +dependencies = [ + "murmurhash32", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64186801b6e06b3a1c4275e23b517835ff4ecbb707318b838dc9de457c062200" +dependencies = [ + "serde", +] + [[package]] name = "target-lexicon" version = "0.12.6" @@ -1482,7 +1598,16 @@ version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows-targets", + "windows-targets 0.42.1", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.1", ] [[package]] 
@@ -1491,21 +1616,42 @@ version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" dependencies = [ - "windows_aarch64_gnullvm", + "windows_aarch64_gnullvm 0.42.1", "windows_aarch64_msvc 0.42.1", "windows_i686_gnu 0.42.1", "windows_i686_msvc 0.42.1", "windows_x86_64_gnu 0.42.1", - "windows_x86_64_gnullvm", + "windows_x86_64_gnullvm 0.42.1", "windows_x86_64_msvc 0.42.1", ] +[[package]] +name = "windows-targets" +version = "0.48.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + [[package]] name = "windows_aarch64_msvc" version = "0.39.0" @@ -1518,6 +1664,12 @@ version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + [[package]] name = "windows_i686_gnu" version = "0.39.0" @@ -1530,6 +1682,12 @@ version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + [[package]] name = "windows_i686_msvc" version = "0.39.0" @@ -1542,6 +1700,12 @@ version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + [[package]] name = "windows_x86_64_gnu" version = "0.39.0" @@ -1554,12 +1718,24 @@ version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + [[package]] name = "windows_x86_64_msvc" version = "0.39.0" @@ -1571,3 +1747,39 @@ name = "windows_x86_64_msvc" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.8+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index 83ac924..5f36877 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,4 +22,4 @@ serde_json = "1.0.91" [dependencies.pyo3] version = "0.18.0" -features = ["extension-module"] +features = ["chrono", "extension-module"] diff --git a/src/document.rs b/src/document.rs index 1f5d4d6..2a5a41a 100644 --- a/src/document.rs +++ b/src/document.rs @@ -9,11 +9,11 @@ use pyo3::{ }, }; -use chrono::{offset::TimeZone, Utc}; +use chrono::{offset::TimeZone, NaiveDateTime, Utc}; use tantivy as tv; -use crate::{facet::Facet, to_pyerr}; +use crate::{facet::Facet, schema::Schema, to_pyerr}; use serde_json::Value as JsonValue; use std::{ collections::{BTreeMap, HashMap}, @@ -128,7 +128,25 @@ fn value_to_string(value: &Value) -> String { /// /// Example: /// >>> doc = tantivy.Document(title="The Old Man and the Sea", body="...") - +/// +/// For numeric fields, the [`Document`] constructor does not have any +/// information about the type and will try to guess the type. 
+/// Therefore, it is recommended to use the [`Document::from_dict()`], +/// [`Document::extend()`], or `Document::add_*()` functions to provide +/// explicit type information. +/// +/// Example: +/// >>> schema = ( +/// SchemaBuilder() +/// .add_unsigned_field("unsigned") +/// .add_integer_field("signed") +/// .add_float_field("float") +/// .build() +/// ) +/// >>> doc = tantivy.Document.from_dict( +/// {"unsigned": 1000, "signed": -5, "float": 0.4}, +/// schema, +/// ) #[pyclass] #[derive(Default)] pub(crate) struct Document { @@ -175,18 +193,7 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult { if let Ok(num) = any.extract::() { return Ok(Value::F64(num)); } - if let Ok(py_datetime) = any.downcast::() { - let datetime = Utc - .with_ymd_and_hms( - py_datetime.get_year(), - py_datetime.get_month().into(), - py_datetime.get_day().into(), - py_datetime.get_hour().into(), - py_datetime.get_minute().into(), - py_datetime.get_second().into(), - ) - .single() - .unwrap(); + if let Ok(datetime) = any.extract::() { return Ok(Value::Date(tv::DateTime::from_timestamp_secs( datetime.timestamp(), ))); @@ -200,6 +207,60 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult { Err(to_pyerr(format!("Value unsupported {any:?}"))) } +pub(crate) fn extract_value_for_type( + any: &PyAny, + tv_type: tv::schema::Type, + field_name: &str, +) -> PyResult { + // Helper function to create `PyErr`s returned by this function. 
+ fn to_pyerr_for_type<'a, E: std::error::Error>( + type_name: &'a str, + field_name: &'a str, + any: &'a PyAny, + ) -> impl Fn(E) -> PyErr + 'a { + move |_| { + to_pyerr(format!( + "Expected {} type for field {}, got {:?}", + type_name, field_name, any + )) + } + } + + let value = match tv_type { + tv::schema::Type::Str => Value::Str( + any.extract::() + .map_err(to_pyerr_for_type("Str", field_name, any))?, + ), + tv::schema::Type::U64 => Value::U64( + any.extract::() + .map_err(to_pyerr_for_type("U64", field_name, any))?, + ), + tv::schema::Type::I64 => Value::I64( + any.extract::() + .map_err(to_pyerr_for_type("I64", field_name, any))?, + ), + tv::schema::Type::F64 => Value::F64( + any.extract::() + .map_err(to_pyerr_for_type("F64", field_name, any))?, + ), + tv::schema::Type::Date => { + let datetime = any + .extract::() + .map_err(to_pyerr_for_type("DateTime", field_name, any))?; + + Value::Date(tv::DateTime::from_timestamp_secs(datetime.timestamp())) + } + tv::schema::Type::Facet => Value::Facet( + any.extract::() + .map_err(to_pyerr_for_type("Facet", field_name, any))? + .inner, + ), + _ => return Err(to_pyerr(format!("Value unsupported {:?}", any))), + }; + + Ok(value) +} + fn extract_value_single_or_list(any: &PyAny) -> PyResult> { if let Ok(values) = any.downcast::() { values.iter().map(extract_value).collect() @@ -208,51 +269,124 @@ fn extract_value_single_or_list(any: &PyAny) -> PyResult> { } } +fn extract_value_single_or_list_for_type( + any: &PyAny, + field_type: &tv::schema::FieldType, + field_name: &str, +) -> PyResult> { + // Check if a numeric fast field supports multivalues. 
+ if let Ok(values) = any.downcast::() { + values + .iter() + .map(|any| { + extract_value_for_type(any, field_type.value_type(), field_name) + }) + .collect() + } else { + Ok(vec![extract_value_for_type( + any, + field_type.value_type(), + field_name, + )?]) + } +} + +impl Document { + fn extract_py_values_from_dict( + py_dict: &PyDict, + schema: Option<&Schema>, + out_field_values: &mut BTreeMap>, + ) -> PyResult<()> { + // TODO: Reserve when https://github.com/rust-lang/rust/issues/72631 is stable. + // out_field_values.reserve(py_dict.len()); + + for key_value_any in py_dict.items() { + if let Ok(key_value) = key_value_any.downcast::() { + if key_value.len() != 2 { + continue; + } + let key = key_value.get_item(0)?.extract::()?; + + let field_type = if let Some(schema) = schema { + let field_type = schema + .inner + .get_field(key.as_str()) + .map(|field| { + schema.inner.get_field_entry(field).field_type() + }) + .ok(); + + if let Some(field_type) = field_type { + // A field type was found, so validate it after the values are extracted. + Some(field_type) + } else { + // The field does not exist in the schema, so skip over it. + continue; + } + } else { + // No schema was provided, so do not validate anything. + None + }; + + let value_list = if let Some(field_type) = field_type { + extract_value_single_or_list_for_type( + key_value.get_item(1)?, + field_type, + key.as_str(), + )? + } else { + extract_value_single_or_list(key_value.get_item(1)?)? + }; + + out_field_values.insert(key, value_list); + } + } + + Ok(()) + } +} + #[pymethods] impl Document { + /// Creates a new document with optional fields from `**kwargs`. + /// + /// Note that the types of numeric fields are unknown here. To + /// provide explicit type information, use the [`from_dict()`], + /// [`extend()`], or `add_*()` functions. 
#[new] #[pyo3(signature = (**kwargs))] fn new(kwargs: Option<&PyDict>) -> PyResult { let mut document = Document::default(); if let Some(field_dict) = kwargs { - document.extend(field_dict)?; + document.extend(field_dict, None)?; } Ok(document) } - fn extend(&mut self, py_dict: &PyDict) -> PyResult<()> { - let mut field_values: BTreeMap> = - BTreeMap::new(); - for key_value_any in py_dict.items() { - if let Ok(key_value) = key_value_any.downcast::() { - if key_value.len() != 2 { - continue; - } - let key: String = key_value.get_item(0)?.extract()?; - let value_list = - extract_value_single_or_list(key_value.get_item(1)?)?; - field_values.insert(key, value_list); - } - } - self.field_values.extend(field_values.into_iter()); - Ok(()) + fn extend( + &mut self, + py_dict: &PyDict, + schema: Option<&Schema>, + ) -> PyResult<()> { + Document::extract_py_values_from_dict( + py_dict, + schema, + &mut self.field_values, + ) } #[staticmethod] - fn from_dict(py_dict: &PyDict) -> PyResult { + fn from_dict( + py_dict: &PyDict, + schema: Option<&Schema>, + ) -> PyResult { let mut field_values: BTreeMap> = BTreeMap::new(); - for key_value_any in py_dict.items() { - if let Ok(key_value) = key_value_any.downcast::() { - if key_value.len() != 2 { - continue; - } - let key: String = key_value.get_item(0)?.extract()?; - let value_list = - extract_value_single_or_list(key_value.get_item(1)?)?; - field_values.insert(key, value_list); - } - } + Document::extract_py_values_from_dict( + py_dict, + schema, + &mut field_values, + )?; Ok(Document { field_values }) } diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index 328c535..c0ea109 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -13,6 +13,7 @@ def schema(): .build() ) + def schema_numeric_fields(): return ( SchemaBuilder() @@ -22,6 +23,7 @@ def schema_numeric_fields(): .build() ) + def create_index(dir=None): # assume all tests will use the same documents for now # other methods may set up function-local 
indexes @@ -75,6 +77,7 @@ def create_index(dir=None): index.reload() return index + def create_index_with_numeric_fields(dir=None): index = Index(schema_numeric_fields(), dir) writer = index.writer(10_000_000, 1) @@ -116,11 +119,12 @@ def create_index_with_numeric_fields(dir=None): index.reload() return index + def spanish_schema(): return ( SchemaBuilder() - .add_text_field("title", stored=True, tokenizer_name='es_stem') - .add_text_field("body", tokenizer_name='es_stem') + .add_text_field("title", stored=True, tokenizer_name="es_stem") + .add_text_field("body", tokenizer_name="es_stem") .build() ) @@ -247,7 +251,7 @@ class TestClass(object): float_query = index.parse_query("3.5", ["rating"]) result = searcher.search(float_query) assert len(result.hits) == 1 - assert searcher.doc(result.hits[0][1])['rating'][0] == 3.5 + assert searcher.doc(result.hits[0][1])["rating"][0] == 3.5 integer_query = index.parse_query("1", ["id"]) result = searcher.search(integer_query) @@ -351,6 +355,67 @@ class TestClass(object): result = searcher.search(query, 10, order_by_field="order") assert len(result.hits) == 0 + def test_doc_from_dict_schema_validation(self): + schema = ( + SchemaBuilder() + .add_unsigned_field("unsigned") + .add_integer_field("signed") + .add_float_field("float") + .build() + ) + + good = Document.from_dict( + {"unsigned": 1000, "signed": -5, "float": 0.4}, + schema, + ) + + good = Document.from_dict( + {"unsigned": 1000, "signed": -5, "float": 0.4}, + schema, + ) + + with pytest.raises(ValueError): + bad = Document.from_dict( + {"unsigned": -50, "signed": -5, "float": 0.4}, + schema, + ) + + with pytest.raises(ValueError): + bad = Document.from_dict( + {"unsigned": 1000, "signed": 50.4, "float": 0.4}, + schema, + ) + + with pytest.raises(ValueError): + bad = Document.from_dict( + { + "unsigned": 1000, + "signed": -5, + "float": "bad_string", + }, + schema, + ) + + with pytest.raises(ValueError): + bad = Document.from_dict( + { + "unsigned": [1000, -50], + 
"signed": -5, + "float": 0.4, + }, + schema, + ) + + with pytest.raises(ValueError): + bad = Document.from_dict( + { + "unsigned": 1000, + "signed": [-5, 150, -3.14], + "float": 0.4, + }, + schema, + ) + class TestUpdateClass(object): def test_delete_update(self, ram_index): @@ -534,14 +599,17 @@ class TestJsonField: # assert len(result.hits) == 1 -@pytest.mark.parametrize('bytes_kwarg', [True, False]) -@pytest.mark.parametrize('bytes_payload', [ - b"abc", - bytearray(b"abc"), - memoryview(b"abc"), - BytesIO(b"abc").read(), - BytesIO(b"abc").getbuffer(), -]) +@pytest.mark.parametrize("bytes_kwarg", [True, False]) +@pytest.mark.parametrize( + "bytes_payload", + [ + b"abc", + bytearray(b"abc"), + memoryview(b"abc"), + BytesIO(b"abc").read(), + BytesIO(b"abc").getbuffer(), + ], +) def test_bytes(bytes_kwarg, bytes_payload): schema = SchemaBuilder().add_bytes_field("embedding").build() index = Index(schema)