Add schema validation to PyDict -> Document (#88)

* Add schema validation to PyDict -> Document

* Address comments

* Add documentation about new functionality
master
Chris Tam 2023-07-21 18:13:03 -04:00 committed by GitHub
parent a266f41974
commit b377f570ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 532 additions and 118 deletions

336
Cargo.lock generated
View File

@ -4,20 +4,20 @@ version = 3
[[package]] [[package]]
name = "ahash" name = "ahash"
version = "0.7.6" version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
dependencies = [ dependencies = [
"getrandom", "cfg-if",
"once_cell", "once_cell",
"version_check", "version_check",
] ]
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.20" version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]
@ -56,9 +56,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.13.1" version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
@ -66,6 +66,12 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
[[package]] [[package]]
name = "bitpacking" name = "bitpacking"
version = "0.8.4" version = "0.8.4"
@ -92,6 +98,9 @@ name = "cc"
version = "1.0.79" version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
dependencies = [
"jobserver",
]
[[package]] [[package]]
name = "census" name = "census"
@ -259,6 +268,27 @@ version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
[[package]]
name = "errno"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
]
[[package]] [[package]]
name = "fail" name = "fail"
version = "0.5.1" version = "0.5.1"
@ -276,20 +306,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25c7df09945d65ea8d70b3321547ed414bbc540aad5bac6883d021b970f35b04" checksum = "25c7df09945d65ea8d70b3321547ed414bbc540aad5bac6883d021b970f35b04"
[[package]]
name = "fastfield_codecs"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "374a3a53c1bd5fb31b10084229290eafb0a05f260ec90f1f726afffda4877a8a"
dependencies = [
"fastdivide",
"itertools",
"log",
"ownedbytes",
"tantivy-bitpacker",
"tantivy-common",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "1.8.0" version = "1.8.0"
@ -300,13 +316,19 @@ dependencies = [
] ]
[[package]] [[package]]
name = "fs2" name = "fnv"
version = "0.4.3" version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "fs4"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47"
dependencies = [ dependencies = [
"libc", "rustix",
"winapi", "windows-sys 0.48.0",
] ]
[[package]] [[package]]
@ -424,9 +446,9 @@ dependencies = [
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.12.3" version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
dependencies = [ dependencies = [
"ahash", "ahash",
] ]
@ -503,6 +525,15 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440"
[[package]]
name = "jobserver"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.61" version = "0.3.61"
@ -526,9 +557,9 @@ checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.139" version = "0.2.147"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
[[package]] [[package]]
name = "link-cplusplus" name = "link-cplusplus"
@ -539,6 +570,12 @@ dependencies = [
"cc", "cc",
] ]
[[package]]
name = "linux-raw-sys"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.9" version = "0.4.9"
@ -574,18 +611,18 @@ dependencies = [
[[package]] [[package]]
name = "lru" name = "lru"
version = "0.7.8" version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999beba7b6e8345721bd280141ed958096a2e4abdf74f67ff4ce49b4b54e47a" checksum = "718e8fae447df0c7e1ba7f5189829e63fd536945c8988d61444c19039f16b670"
dependencies = [ dependencies = [
"hashbrown", "hashbrown",
] ]
[[package]] [[package]]
name = "lz4_flex" name = "lz4_flex"
version = "0.9.5" version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a8cbbb2831780bc3b9c15a41f5b49222ef756b6730a95f3decfdd15903eb5a3" checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83"
[[package]] [[package]]
name = "matchers" name = "matchers"
@ -614,9 +651,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]] [[package]]
name = "memmap2" name = "memmap2"
version = "0.5.8" version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc" checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872"
dependencies = [ dependencies = [
"libc", "libc",
] ]
@ -641,12 +678,9 @@ dependencies = [
[[package]] [[package]]
name = "murmurhash32" name = "murmurhash32"
version = "0.2.0" version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d736ff882f0e85fe9689fb23db229616c4c00aee2b3ac282f666d8f20eb25d4a" checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df"
dependencies = [
"byteorder",
]
[[package]] [[package]]
name = "nu-ansi-term" name = "nu-ansi-term"
@ -710,9 +744,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]] [[package]]
name = "ownedbytes" name = "ownedbytes"
version = "0.4.0" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e957eaa64a299f39755416e5b3128c505e9d63a91d0453771ad2ccd3907f8db" checksum = "c718e498b20704d5fb5d51d07f414a22f61c19254c1708e117b93fd76860739c"
dependencies = [ dependencies = [
"stable_deref_trait", "stable_deref_trait",
] ]
@ -737,7 +771,7 @@ dependencies = [
"libc", "libc",
"redox_syscall", "redox_syscall",
"smallvec", "smallvec",
"windows-sys", "windows-sys 0.45.0",
] ]
[[package]] [[package]]
@ -752,6 +786,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkg-config"
version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]] [[package]]
name = "ppv-lite86" name = "ppv-lite86"
version = "0.2.17" version = "0.2.17"
@ -774,6 +814,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06a3d8e8a46ab2738109347433cb7b96dffda2e4a218b03ef27090238886b147" checksum = "06a3d8e8a46ab2738109347433cb7b96dffda2e4a218b03ef27090238886b147"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"chrono",
"indoc", "indoc",
"libc", "libc",
"memoffset 0.8.0", "memoffset 0.8.0",
@ -894,7 +935,7 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [ dependencies = [
"bitflags", "bitflags 1.3.2",
] ]
[[package]] [[package]]
@ -946,6 +987,19 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustix"
version = "0.38.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5"
dependencies = [
"bitflags 2.3.3",
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.48.0",
]
[[package]] [[package]]
name = "rustversion" name = "rustversion"
version = "1.0.11" version = "1.0.11"
@ -1016,6 +1070,15 @@ dependencies = [
"lazy_static", "lazy_static",
] ]
[[package]]
name = "sketches-ddsketch"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.7" version = "0.4.7"
@ -1050,7 +1113,7 @@ dependencies = [
[[package]] [[package]]
name = "tantivy" name = "tantivy"
version = "0.19.2" version = "0.20.1"
dependencies = [ dependencies = [
"chrono", "chrono",
"futures", "futures",
@ -1058,14 +1121,14 @@ dependencies = [
"pyo3", "pyo3",
"pyo3-build-config", "pyo3-build-config",
"serde_json", "serde_json",
"tantivy 0.19.2 (registry+https://github.com/rust-lang/crates.io-index)", "tantivy 0.20.2",
] ]
[[package]] [[package]]
name = "tantivy" name = "tantivy"
version = "0.19.2" version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bb26a6b22c84d8be41d99a14016d6f04d30d8d31a2ea411a8ab553af5cc490d" checksum = "aec540e9cebc88f523f67f596dee213e491f0c55961de013566f267a0c31f5e9"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"arc-swap", "arc-swap",
@ -1079,8 +1142,7 @@ dependencies = [
"downcast-rs", "downcast-rs",
"fail", "fail",
"fastdivide", "fastdivide",
"fastfield_codecs", "fs4",
"fs2",
"htmlescape", "htmlescape",
"itertools", "itertools",
"levenshtein_automata", "levenshtein_automata",
@ -1093,19 +1155,21 @@ dependencies = [
"num_cpus", "num_cpus",
"once_cell", "once_cell",
"oneshot", "oneshot",
"ownedbytes",
"rayon", "rayon",
"regex", "regex",
"rust-stemmers", "rust-stemmers",
"rustc-hash", "rustc-hash",
"serde", "serde",
"serde_json", "serde_json",
"sketches-ddsketch",
"smallvec", "smallvec",
"stable_deref_trait",
"tantivy-bitpacker", "tantivy-bitpacker",
"tantivy-columnar",
"tantivy-common", "tantivy-common",
"tantivy-fst", "tantivy-fst",
"tantivy-query-grammar", "tantivy-query-grammar",
"tantivy-stacker",
"tantivy-tokenizer-api",
"tempfile", "tempfile",
"thiserror", "thiserror",
"time 0.3.17", "time 0.3.17",
@ -1115,18 +1179,40 @@ dependencies = [
[[package]] [[package]]
name = "tantivy-bitpacker" name = "tantivy-bitpacker"
version = "0.3.0" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e71a0c95b82d4292b097a09b989a6380d28c3a86800c841a2d03bae1fc8b9fa6" checksum = "16099e96f0ede682084469b80d6909dc170aa2b11d2a45538b5b36b2a90090b9"
dependencies = [
"bitpacking",
]
[[package]]
name = "tantivy-columnar"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e32b024b26eab93eb8648faf08004356bf9d47376557ee4409f4b210163656"
dependencies = [
"fastdivide",
"fnv",
"itertools",
"serde",
"tantivy-bitpacker",
"tantivy-common",
"tantivy-sstable",
"tantivy-stacker",
]
[[package]] [[package]]
name = "tantivy-common" name = "tantivy-common"
version = "0.4.0" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14fef4182bb60df9a4b92cd8ecab39ba2e50a05542934af17eef1f49660705cb" checksum = "e7d12fdd6ec0f7e0962f129c03c696a85ec567734950cbb2b89af4a293ce342f"
dependencies = [ dependencies = [
"async-trait",
"byteorder", "byteorder",
"ownedbytes", "ownedbytes",
"serde",
"time 0.3.17",
] ]
[[package]] [[package]]
@ -1142,15 +1228,45 @@ dependencies = [
[[package]] [[package]]
name = "tantivy-query-grammar" name = "tantivy-query-grammar"
version = "0.19.0" version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "343e3ada4c1c480953f6960f8a21ce9c76611480ffdd4f4e230fdddce0fc5331" checksum = "106d8f78ad1da4f0fdd526a0760c326c0573510d4dedabeb1962d35a35879797"
dependencies = [ dependencies = [
"combine", "combine",
"once_cell", "once_cell",
"regex", "regex",
] ]
[[package]]
name = "tantivy-sstable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eda34243d3ee64bd8f9ba74a3b0d05f4d07beff7767a727212e9b5a19c13dde7"
dependencies = [
"tantivy-common",
"tantivy-fst",
"zstd",
]
[[package]]
name = "tantivy-stacker"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b9e9470301b026ad3b95f79a791a2a3ee81f3ab16fbe412a9dd81ff834acf5"
dependencies = [
"murmurhash32",
"tantivy-common",
]
[[package]]
name = "tantivy-tokenizer-api"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64186801b6e06b3a1c4275e23b517835ff4ecbb707318b838dc9de457c062200"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "target-lexicon" name = "target-lexicon"
version = "0.12.6" version = "0.12.6"
@ -1482,7 +1598,16 @@ version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
dependencies = [ dependencies = [
"windows-targets", "windows-targets 0.42.1",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.1",
] ]
[[package]] [[package]]
@ -1491,21 +1616,42 @@ version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
dependencies = [ dependencies = [
"windows_aarch64_gnullvm", "windows_aarch64_gnullvm 0.42.1",
"windows_aarch64_msvc 0.42.1", "windows_aarch64_msvc 0.42.1",
"windows_i686_gnu 0.42.1", "windows_i686_gnu 0.42.1",
"windows_i686_msvc 0.42.1", "windows_i686_msvc 0.42.1",
"windows_x86_64_gnu 0.42.1", "windows_x86_64_gnu 0.42.1",
"windows_x86_64_gnullvm", "windows_x86_64_gnullvm 0.42.1",
"windows_x86_64_msvc 0.42.1", "windows_x86_64_msvc 0.42.1",
] ]
[[package]]
name = "windows-targets"
version = "0.48.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
dependencies = [
"windows_aarch64_gnullvm 0.48.0",
"windows_aarch64_msvc 0.48.0",
"windows_i686_gnu 0.48.0",
"windows_i686_msvc 0.48.0",
"windows_x86_64_gnu 0.48.0",
"windows_x86_64_gnullvm 0.48.0",
"windows_x86_64_msvc 0.48.0",
]
[[package]] [[package]]
name = "windows_aarch64_gnullvm" name = "windows_aarch64_gnullvm"
version = "0.42.1" version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
[[package]] [[package]]
name = "windows_aarch64_msvc" name = "windows_aarch64_msvc"
version = "0.39.0" version = "0.39.0"
@ -1518,6 +1664,12 @@ version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
[[package]] [[package]]
name = "windows_i686_gnu" name = "windows_i686_gnu"
version = "0.39.0" version = "0.39.0"
@ -1530,6 +1682,12 @@ version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
[[package]]
name = "windows_i686_gnu"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
[[package]] [[package]]
name = "windows_i686_msvc" name = "windows_i686_msvc"
version = "0.39.0" version = "0.39.0"
@ -1542,6 +1700,12 @@ version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
[[package]]
name = "windows_i686_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
[[package]] [[package]]
name = "windows_x86_64_gnu" name = "windows_x86_64_gnu"
version = "0.39.0" version = "0.39.0"
@ -1554,12 +1718,24 @@ version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
[[package]] [[package]]
name = "windows_x86_64_gnullvm" name = "windows_x86_64_gnullvm"
version = "0.42.1" version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
[[package]] [[package]]
name = "windows_x86_64_msvc" name = "windows_x86_64_msvc"
version = "0.39.0" version = "0.39.0"
@ -1571,3 +1747,39 @@ name = "windows_x86_64_msvc"
version = "0.42.1" version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
[[package]]
name = "zstd"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "6.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
dependencies = [
"libc",
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.8+zstd.1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
dependencies = [
"cc",
"libc",
"pkg-config",
]

View File

@ -22,4 +22,4 @@ serde_json = "1.0.91"
[dependencies.pyo3] [dependencies.pyo3]
version = "0.18.0" version = "0.18.0"
features = ["extension-module"] features = ["chrono", "extension-module"]

View File

@ -9,11 +9,11 @@ use pyo3::{
}, },
}; };
use chrono::{offset::TimeZone, Utc}; use chrono::{offset::TimeZone, NaiveDateTime, Utc};
use tantivy as tv; use tantivy as tv;
use crate::{facet::Facet, to_pyerr}; use crate::{facet::Facet, schema::Schema, to_pyerr};
use serde_json::Value as JsonValue; use serde_json::Value as JsonValue;
use std::{ use std::{
collections::{BTreeMap, HashMap}, collections::{BTreeMap, HashMap},
@ -128,7 +128,25 @@ fn value_to_string(value: &Value) -> String {
/// ///
/// Example: /// Example:
/// >>> doc = tantivy.Document(title="The Old Man and the Sea", body="...") /// >>> doc = tantivy.Document(title="The Old Man and the Sea", body="...")
///
/// For numeric fields, the [`Document`] constructor does not have any
/// information about the type and will try to guess the type.
/// Therefore, it is recommended to use the [`Document::from_dict()`],
/// [`Document::extend()`], or `Document::add_*()` functions to provide
/// explicit type information.
///
/// Example:
/// >>> schema = (
/// SchemaBuilder()
/// .add_unsigned_field("unsigned")
/// .add_integer_field("signed")
/// .add_float_field("float")
/// .build()
/// )
/// >>> doc = tantivy.Document.from_dict(
/// {"unsigned": 1000, "signed": -5, "float": 0.4},
/// schema,
/// )
#[pyclass] #[pyclass]
#[derive(Default)] #[derive(Default)]
pub(crate) struct Document { pub(crate) struct Document {
@ -175,18 +193,7 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
if let Ok(num) = any.extract::<f64>() { if let Ok(num) = any.extract::<f64>() {
return Ok(Value::F64(num)); return Ok(Value::F64(num));
} }
if let Ok(py_datetime) = any.downcast::<PyDateTime>() { if let Ok(datetime) = any.extract::<NaiveDateTime>() {
let datetime = Utc
.with_ymd_and_hms(
py_datetime.get_year(),
py_datetime.get_month().into(),
py_datetime.get_day().into(),
py_datetime.get_hour().into(),
py_datetime.get_minute().into(),
py_datetime.get_second().into(),
)
.single()
.unwrap();
return Ok(Value::Date(tv::DateTime::from_timestamp_secs( return Ok(Value::Date(tv::DateTime::from_timestamp_secs(
datetime.timestamp(), datetime.timestamp(),
))); )));
@ -200,6 +207,60 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
Err(to_pyerr(format!("Value unsupported {any:?}"))) Err(to_pyerr(format!("Value unsupported {any:?}")))
} }
/// Converts a Python object into a tantivy [`Value`] of an explicitly
/// requested type.
///
/// * `any` - the Python object to convert.
/// * `tv_type` - the tantivy value type the result must have.
/// * `field_name` - name of the field being populated; only used to build
///   error messages.
///
/// Supported types are `Str`, `U64`, `I64`, `F64`, `Date` and `Facet`.
/// Dates are extracted as a `NaiveDateTime` and stored with second
/// resolution.
///
/// Returns an error if `any` cannot be extracted as the requested type, or
/// if `tv_type` is any other type.
pub(crate) fn extract_value_for_type(
    any: &PyAny,
    tv_type: tv::schema::Type,
    field_name: &str,
) -> PyResult<Value> {
    // Builds the error reported when `any` does not convert to the requested
    // type. The underlying extraction error is intentionally discarded in
    // favour of a message that names the field and the expected type.
    let type_mismatch = |type_name: &str| -> PyErr {
        to_pyerr(format!(
            "Expected {} type for field {}, got {:?}",
            type_name, field_name, any
        ))
    };

    match tv_type {
        tv::schema::Type::Str => any
            .extract::<String>()
            .map(Value::Str)
            .map_err(|_| type_mismatch("Str")),
        tv::schema::Type::U64 => any
            .extract::<u64>()
            .map(Value::U64)
            .map_err(|_| type_mismatch("U64")),
        tv::schema::Type::I64 => any
            .extract::<i64>()
            .map(Value::I64)
            .map_err(|_| type_mismatch("I64")),
        tv::schema::Type::F64 => any
            .extract::<f64>()
            .map(Value::F64)
            .map_err(|_| type_mismatch("F64")),
        tv::schema::Type::Date => any
            .extract::<NaiveDateTime>()
            .map(|datetime| {
                Value::Date(tv::DateTime::from_timestamp_secs(
                    datetime.timestamp(),
                ))
            })
            .map_err(|_| type_mismatch("DateTime")),
        tv::schema::Type::Facet => any
            .extract::<Facet>()
            .map(|facet| Value::Facet(facet.inner))
            .map_err(|_| type_mismatch("Facet")),
        // Every remaining schema type is rejected outright.
        _ => Err(to_pyerr(format!("Value unsupported {:?}", any))),
    }
}
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> { fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
if let Ok(values) = any.downcast::<PyList>() { if let Ok(values) = any.downcast::<PyList>() {
values.iter().map(extract_value).collect() values.iter().map(extract_value).collect()
@ -208,51 +269,124 @@ fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
} }
} }
/// Converts a Python object, or a Python list of objects, into tantivy
/// values of the type declared by `field_type`.
///
/// * `any` - a single Python value or a `list` of values.
/// * `field_type` - schema field type whose value type every element must
///   convert to.
/// * `field_name` - name of the field; forwarded for error messages.
///
/// A list yields one value per element, in order. Anything else is treated
/// as a single value and yields a one-element vector. The first element
/// that fails to convert aborts the whole conversion with its error.
fn extract_value_single_or_list_for_type(
    any: &PyAny,
    field_type: &tv::schema::FieldType,
    field_name: &str,
) -> PyResult<Vec<Value>> {
    let value_type = field_type.value_type();
    match any.downcast::<PyList>() {
        Ok(list) => list
            .iter()
            .map(|item| extract_value_for_type(item, value_type, field_name))
            .collect(),
        Err(_) => extract_value_for_type(any, value_type, field_name)
            .map(|value| vec![value]),
    }
}
impl Document {
    /// Reads the entries of a Python dict into `out_field_values`.
    ///
    /// * `py_dict` - mapping from field name to a value or list of values.
    /// * `schema` - when given, each value is converted to the type its
    ///   field declares in the schema, and keys that are not schema fields
    ///   are skipped. When `None`, value types are guessed.
    /// * `out_field_values` - destination map; an existing entry with the
    ///   same key is replaced.
    ///
    /// Returns an error as soon as a key is not a string or a value fails
    /// to convert. Entries inserted before the failure stay in
    /// `out_field_values`.
    fn extract_py_values_from_dict(
        py_dict: &PyDict,
        schema: Option<&Schema>,
        out_field_values: &mut BTreeMap<String, Vec<tv::schema::Value>>,
    ) -> PyResult<()> {
        // TODO: Reserve when https://github.com/rust-lang/rust/issues/72631 is stable.
        // out_field_values.reserve(py_dict.len());

        for item in py_dict.items() {
            // Anything that is not a (key, value) pair is ignored.
            let pair = match item.downcast::<PyTuple>() {
                Ok(pair) if pair.len() == 2 => pair,
                _ => continue,
            };

            let field_name = pair.get_item(0)?.extract::<String>()?;

            let values = match schema {
                // No schema was provided, so do not validate anything.
                None => extract_value_single_or_list(pair.get_item(1)?)?,
                Some(schema) => {
                    let field =
                        match schema.inner.get_field(field_name.as_str()) {
                            Ok(field) => field,
                            // The field does not exist in the schema, so
                            // skip over it.
                            Err(_) => continue,
                        };
                    // A field type was found, so the values are validated
                    // against it while they are extracted.
                    let field_type =
                        schema.inner.get_field_entry(field).field_type();
                    extract_value_single_or_list_for_type(
                        pair.get_item(1)?,
                        field_type,
                        field_name.as_str(),
                    )?
                }
            };

            out_field_values.insert(field_name, values);
        }

        Ok(())
    }
}
#[pymethods] #[pymethods]
impl Document { impl Document {
/// Creates a new document with optional fields from `**kwargs`.
///
/// Note that the types of numeric fields are unknown here. To
/// provide explicit type information, use the [`from_dict()`],
/// [`extend()`], or `add_<type>()` functions.
#[new] #[new]
#[pyo3(signature = (**kwargs))] #[pyo3(signature = (**kwargs))]
fn new(kwargs: Option<&PyDict>) -> PyResult<Self> { fn new(kwargs: Option<&PyDict>) -> PyResult<Self> {
let mut document = Document::default(); let mut document = Document::default();
if let Some(field_dict) = kwargs { if let Some(field_dict) = kwargs {
document.extend(field_dict)?; document.extend(field_dict, None)?;
} }
Ok(document) Ok(document)
} }
fn extend(&mut self, py_dict: &PyDict) -> PyResult<()> { fn extend(
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> = &mut self,
BTreeMap::new(); py_dict: &PyDict,
for key_value_any in py_dict.items() { schema: Option<&Schema>,
if let Ok(key_value) = key_value_any.downcast::<PyTuple>() { ) -> PyResult<()> {
if key_value.len() != 2 { Document::extract_py_values_from_dict(
continue; py_dict,
} schema,
let key: String = key_value.get_item(0)?.extract()?; &mut self.field_values,
let value_list = )
extract_value_single_or_list(key_value.get_item(1)?)?;
field_values.insert(key, value_list);
}
}
self.field_values.extend(field_values.into_iter());
Ok(())
} }
#[staticmethod] #[staticmethod]
fn from_dict(py_dict: &PyDict) -> PyResult<Document> { fn from_dict(
py_dict: &PyDict,
schema: Option<&Schema>,
) -> PyResult<Document> {
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> = let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> =
BTreeMap::new(); BTreeMap::new();
for key_value_any in py_dict.items() { Document::extract_py_values_from_dict(
if let Ok(key_value) = key_value_any.downcast::<PyTuple>() { py_dict,
if key_value.len() != 2 { schema,
continue; &mut field_values,
} )?;
let key: String = key_value.get_item(0)?.extract()?;
let value_list =
extract_value_single_or_list(key_value.get_item(1)?)?;
field_values.insert(key, value_list);
}
}
Ok(Document { field_values }) Ok(Document { field_values })
} }

View File

@ -13,6 +13,7 @@ def schema():
.build() .build()
) )
def schema_numeric_fields(): def schema_numeric_fields():
return ( return (
SchemaBuilder() SchemaBuilder()
@ -22,6 +23,7 @@ def schema_numeric_fields():
.build() .build()
) )
def create_index(dir=None): def create_index(dir=None):
# assume all tests will use the same documents for now # assume all tests will use the same documents for now
# other methods may set up function-local indexes # other methods may set up function-local indexes
@ -75,6 +77,7 @@ def create_index(dir=None):
index.reload() index.reload()
return index return index
def create_index_with_numeric_fields(dir=None): def create_index_with_numeric_fields(dir=None):
index = Index(schema_numeric_fields(), dir) index = Index(schema_numeric_fields(), dir)
writer = index.writer(10_000_000, 1) writer = index.writer(10_000_000, 1)
@ -116,11 +119,12 @@ def create_index_with_numeric_fields(dir=None):
index.reload() index.reload()
return index return index
def spanish_schema(): def spanish_schema():
return ( return (
SchemaBuilder() SchemaBuilder()
.add_text_field("title", stored=True, tokenizer_name='es_stem') .add_text_field("title", stored=True, tokenizer_name="es_stem")
.add_text_field("body", tokenizer_name='es_stem') .add_text_field("body", tokenizer_name="es_stem")
.build() .build()
) )
@ -247,7 +251,7 @@ class TestClass(object):
float_query = index.parse_query("3.5", ["rating"]) float_query = index.parse_query("3.5", ["rating"])
result = searcher.search(float_query) result = searcher.search(float_query)
assert len(result.hits) == 1 assert len(result.hits) == 1
assert searcher.doc(result.hits[0][1])['rating'][0] == 3.5 assert searcher.doc(result.hits[0][1])["rating"][0] == 3.5
integer_query = index.parse_query("1", ["id"]) integer_query = index.parse_query("1", ["id"])
result = searcher.search(integer_query) result = searcher.search(integer_query)
@ -351,6 +355,67 @@ class TestClass(object):
result = searcher.search(query, 10, order_by_field="order") result = searcher.search(query, 10, order_by_field="order")
assert len(result.hits) == 0 assert len(result.hits) == 0
def test_doc_from_dict_schema_validation(self):
    """Check that passing a schema makes ``Document`` validate field values.

    A well-typed dict must be accepted, both through
    ``Document.from_dict`` and through ``Document.extend``. A dict with a
    value that does not fit the field type declared in the schema must be
    rejected with ``ValueError``, whether the bad value is a scalar or an
    element of a list.
    """
    schema = (
        SchemaBuilder()
        .add_unsigned_field("unsigned")
        .add_integer_field("signed")
        .add_float_field("float")
        .build()
    )

    # Well-typed input: must not raise.
    Document.from_dict(
        {"unsigned": 1000, "signed": -5, "float": 0.4},
        schema,
    )

    # extend() accepts the same optional schema, so it must accept the
    # same well-typed input and reject the same kind of bad input.
    doc = Document()
    doc.extend(
        {"unsigned": 1000, "signed": -5, "float": 0.4},
        schema,
    )
    with pytest.raises(ValueError):
        doc.extend(
            {"unsigned": -50, "signed": -5, "float": 0.4},
            schema,
        )

    # Negative value for an unsigned field.
    with pytest.raises(ValueError):
        Document.from_dict(
            {"unsigned": -50, "signed": -5, "float": 0.4},
            schema,
        )

    # Fractional value for an integer field.
    with pytest.raises(ValueError):
        Document.from_dict(
            {"unsigned": 1000, "signed": 50.4, "float": 0.4},
            schema,
        )

    # String for a float field.
    with pytest.raises(ValueError):
        Document.from_dict(
            {
                "unsigned": 1000,
                "signed": -5,
                "float": "bad_string",
            },
            schema,
        )

    # List for an unsigned field with one negative element.
    with pytest.raises(ValueError):
        Document.from_dict(
            {
                "unsigned": [1000, -50],
                "signed": -5,
                "float": 0.4,
            },
            schema,
        )

    # List for an integer field with one fractional element.
    with pytest.raises(ValueError):
        Document.from_dict(
            {
                "unsigned": 1000,
                "signed": [-5, 150, -3.14],
                "float": 0.4,
            },
            schema,
        )
class TestUpdateClass(object): class TestUpdateClass(object):
def test_delete_update(self, ram_index): def test_delete_update(self, ram_index):
@ -534,14 +599,17 @@ class TestJsonField:
# assert len(result.hits) == 1 # assert len(result.hits) == 1
@pytest.mark.parametrize('bytes_kwarg', [True, False]) @pytest.mark.parametrize("bytes_kwarg", [True, False])
@pytest.mark.parametrize('bytes_payload', [ @pytest.mark.parametrize(
b"abc", "bytes_payload",
bytearray(b"abc"), [
memoryview(b"abc"), b"abc",
BytesIO(b"abc").read(), bytearray(b"abc"),
BytesIO(b"abc").getbuffer(), memoryview(b"abc"),
]) BytesIO(b"abc").read(),
BytesIO(b"abc").getbuffer(),
],
)
def test_bytes(bytes_kwarg, bytes_payload): def test_bytes(bytes_kwarg, bytes_payload):
schema = SchemaBuilder().add_bytes_field("embedding").build() schema = SchemaBuilder().add_bytes_field("embedding").build()
index = Index(schema) index = Index(schema)