feat: upgrade tantivy to 0.22 (#242)

master
Caleb Hattingh 2024-05-03 23:35:19 +02:00 committed by GitHub
parent 9fafdf25cb
commit 983364b3a5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 278 additions and 136 deletions

265
Cargo.lock generated
View File

@ -69,9 +69,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "base64"
version = "0.21.5"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9"
checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51"
[[package]]
name = "bitflags"
@ -87,9 +87,9 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "bitpacking"
version = "0.8.4"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7"
checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
dependencies = [
"crunchy",
]
@ -118,9 +118,9 @@ dependencies = [
[[package]]
name = "census"
version = "0.4.1"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fafee10a5dd1cffcb5cc560e0d0df8803d7355a2b12272e3557dee57314cb6e"
checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0"
[[package]]
name = "cfg-if"
@ -139,7 +139,7 @@ dependencies = [
"js-sys",
"num-traits",
"wasm-bindgen",
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
@ -235,7 +235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
dependencies = [
"libc",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -258,12 +258,12 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "fs4"
version = "0.6.6"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47"
checksum = "21dabded2e32cd57ded879041205c60a4a4c4bab47bd0fd2fa8b01f30849f02b"
dependencies = [
"rustix",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
@ -448,15 +448,6 @@ dependencies = [
"web-sys",
]
[[package]]
name = "itertools"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.0"
@ -504,9 +495,15 @@ checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25"
[[package]]
name = "libc"
version = "0.2.149"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
@ -546,9 +543,9 @@ dependencies = [
[[package]]
name = "lru"
version = "0.11.1"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21"
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
dependencies = [
"hashbrown",
]
@ -586,9 +583,9 @@ checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "memmap2"
version = "0.7.1"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6"
checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
dependencies = [
"libc",
]
@ -641,6 +638,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
"libm",
]
[[package]]
@ -676,9 +674,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "ownedbytes"
version = "0.6.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e8a72b918ae8198abb3a18c190288123e1d442b6b9a7d709305fd194688b4b7"
checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558"
dependencies = [
"stable_deref_trait",
]
@ -703,7 +701,7 @@ dependencies = [
"libc",
"redox_syscall",
"smallvec",
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
@ -730,6 +728,12 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "proc-macro2"
version = "1.0.69"
@ -820,6 +824,46 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
dependencies = [
"num-traits",
"rand",
]
[[package]]
name = "rayon"
version = "1.8.0"
@ -919,7 +963,7 @@ dependencies = [
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -1034,24 +1078,23 @@ dependencies = [
"base64",
"chrono",
"futures",
"itertools 0.12.0",
"itertools",
"pyo3",
"pyo3-build-config",
"pythonize",
"serde",
"serde_json",
"tantivy 0.21.1",
"tantivy 0.22.0",
]
[[package]]
name = "tantivy"
version = "0.21.1"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6083cd777fa94271b8ce0fe4533772cb8110c3044bab048d20f70108329a1f2"
checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856"
dependencies = [
"aho-corasick",
"arc-swap",
"async-trait",
"base64",
"bitpacking",
"byteorder",
@ -1060,16 +1103,16 @@ dependencies = [
"crossbeam-channel",
"downcast-rs",
"fastdivide",
"fnv",
"fs4",
"htmlescape",
"itertools 0.11.0",
"itertools",
"levenshtein_automata",
"log",
"lru",
"lz4_flex",
"measure_time",
"memmap2",
"murmurhash32",
"num_cpus",
"once_cell",
"oneshot",
@ -1097,22 +1140,22 @@ dependencies = [
[[package]]
name = "tantivy-bitpacker"
version = "0.5.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cecb164321482301f514dd582264fa67f70da2d7eb01872ccd71e35e0d96655a"
checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df"
dependencies = [
"bitpacking",
]
[[package]]
name = "tantivy-columnar"
version = "0.2.0"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d85f8019af9a78b3118c11298b36ffd21c2314bd76bbcd9d12e00124cbb7e70"
checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e"
dependencies = [
"downcast-rs",
"fastdivide",
"fnv",
"itertools 0.11.0",
"itertools",
"serde",
"tantivy-bitpacker",
"tantivy-common",
@ -1122,9 +1165,9 @@ dependencies = [
[[package]]
name = "tantivy-common"
version = "0.6.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af4a3a975e604a2aba6b1106a04505e1e7a025e6def477fab6e410b4126471e1"
checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4"
dependencies = [
"async-trait",
"byteorder",
@ -1135,30 +1178,31 @@ dependencies = [
[[package]]
name = "tantivy-fst"
version = "0.4.0"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc3c506b1a8443a3a65352df6382a1fb6a7afe1a02e871cee0d25e2c3d5f3944"
checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18"
dependencies = [
"byteorder",
"regex-syntax 0.6.29",
"regex-syntax 0.8.2",
"utf8-ranges",
]
[[package]]
name = "tantivy-query-grammar"
version = "0.21.0"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d39c5a03100ac10c96e0c8b07538e2ab8b17da56434ab348309b31f23fada77"
checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82"
dependencies = [
"nom",
]
[[package]]
name = "tantivy-sstable"
version = "0.2.0"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc0c1bb43e5e8b8e05eb8009610344dbf285f06066c844032fbb3e546b3c71df"
checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e"
dependencies = [
"tantivy-bitpacker",
"tantivy-common",
"tantivy-fst",
"zstd",
@ -1166,19 +1210,20 @@ dependencies = [
[[package]]
name = "tantivy-stacker"
version = "0.2.0"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2c078595413f13f218cf6f97b23dcfd48936838f1d3d13a1016e05acd64ed6c"
checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8"
dependencies = [
"murmurhash32",
"rand_distr",
"tantivy-common",
]
[[package]]
name = "tantivy-tokenizer-api"
version = "0.2.0"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "347b6fb212b26d3505d224f438e3c4b827ab8bd847fe9953ad5ac6b8f9443b66"
checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04"
dependencies = [
"serde",
]
@ -1199,7 +1244,7 @@ dependencies = [
"fastrand",
"redox_syscall",
"rustix",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -1460,7 +1505,7 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
dependencies = [
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
@ -1469,7 +1514,7 @@ version = "0.51.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1f8cf84f35d2db49a46868f947758c7a1138116f7fac3bc844f43ade1292e64"
dependencies = [
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
@ -1478,7 +1523,16 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.5",
]
[[package]]
@ -1487,13 +1541,29 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
dependencies = [
"windows_aarch64_gnullvm 0.52.5",
"windows_aarch64_msvc 0.52.5",
"windows_i686_gnu 0.52.5",
"windows_i686_gnullvm",
"windows_i686_msvc 0.52.5",
"windows_x86_64_gnu 0.52.5",
"windows_x86_64_gnullvm 0.52.5",
"windows_x86_64_msvc 0.52.5",
]
[[package]]
@ -1502,42 +1572,90 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
[[package]]
name = "windows_i686_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
[[package]]
name = "windows_i686_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
[[package]]
name = "zerocopy"
version = "0.7.31"
@ -1560,28 +1678,27 @@ dependencies = [
[[package]]
name = "zstd"
version = "0.12.4"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "6.0.6"
version = "7.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a"
dependencies = [
"libc",
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.9+zstd.1.5.5"
version = "2.0.10+zstd.1.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656"
checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa"
dependencies = [
"cc",
"pkg-config",

View File

@ -14,9 +14,9 @@ crate-type = ["cdylib"]
pyo3-build-config = "0.20.0"
[dependencies]
base64 = "0.21"
base64 = "0.22"
chrono = "0.4.23"
tantivy = "0.21.0"
tantivy = "0.22.0"
itertools = "0.12.0"
futures = "0.3.26"
pythonize = "0.20.0"

View File

@ -14,15 +14,14 @@ use pyo3::{
use chrono::{offset::TimeZone, NaiveDateTime, Utc};
use tantivy::{self as tv, schema::Value};
use tantivy::{self as tv, schema::document::OwnedValue as Value};
use crate::{facet::Facet, schema::Schema, to_pyerr};
use serde::{
ser::SerializeMap, Deserialize, Deserializer, Serialize, Serializer,
};
use serde_json::Value as JsonValue;
use std::{
collections::{BTreeMap, HashMap},
collections::BTreeMap,
fmt,
net::{IpAddr, Ipv6Addr},
str::FromStr,
@ -54,7 +53,7 @@ pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
}
if let Ok(dict) = any.downcast::<PyDict>() {
if let Ok(json) = pythonize::depythonize(dict) {
return Ok(Value::JsonObject(json));
return Ok(Value::Object(json));
}
}
Err(to_pyerr(format!("Value unsupported {any:?}")))
@ -119,11 +118,11 @@ pub(crate) fn extract_value_for_type(
tv::schema::Type::Json => {
if let Ok(json_str) = any.extract::<&str>() {
return serde_json::from_str(json_str)
.map(Value::JsonObject)
.map(Value::Object)
.map_err(to_pyerr_for_type("Json", field_name, any));
}
Value::JsonObject(
Value::Object(
any.downcast::<PyDict>()
.map(|dict| pythonize::depythonize(dict))
.map_err(to_pyerr_for_type("Json", field_name, any))?
@ -192,32 +191,20 @@ fn extract_value_single_or_list_for_type(
}
}
fn value_to_object(val: &JsonValue, py: Python<'_>) -> PyObject {
match val {
JsonValue::Null => py.None(),
JsonValue::Bool(b) => b.to_object(py),
JsonValue::Number(n) => match n {
n if n.is_i64() => n.as_i64().to_object(py),
n if n.is_u64() => n.as_u64().to_object(py),
n if n.is_f64() => n.as_f64().to_object(py),
_ => panic!("number too large"),
},
JsonValue::String(s) => s.to_object(py),
JsonValue::Array(v) => {
let inner: Vec<_> =
v.iter().map(|x| value_to_object(x, py)).collect();
inner.to_object(py)
}
JsonValue::Object(m) => {
let inner: HashMap<_, _> =
m.iter().map(|(k, v)| (k, value_to_object(v, py))).collect();
inner.to_object(py)
}
fn object_to_py(
py: Python,
obj: &BTreeMap<String, Value>,
) -> PyResult<PyObject> {
let dict = PyDict::new(py);
for (k, v) in obj.iter() {
dict.set_item(k, value_to_py(py, v)?)?;
}
Ok(dict.into())
}
fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
Ok(match value {
Value::Null => py.None(),
Value::Str(text) => text.into_py(py),
Value::U64(num) => (*num).into_py(py),
Value::I64(num) => (*num).into_py(py),
@ -243,13 +230,11 @@ fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
.into_py(py)
}
Value::Facet(f) => Facet { inner: f.clone() }.into_py(py),
Value::JsonObject(json_object) => {
let inner: HashMap<_, _> = json_object
.iter()
.map(|(k, v)| (k, value_to_object(v, py)))
.collect();
inner.to_object(py)
Value::Array(arr) => {
// TODO implement me
unimplemented!();
}
Value::Object(obj) => object_to_py(py, obj)?,
Value::Bool(b) => b.into_py(py),
Value::IpAddr(i) => (*i).to_string().into_py(py),
})
@ -257,6 +242,7 @@ fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
fn value_to_string(value: &Value) -> String {
match value {
Value::Null => format!("{:?}", value),
Value::Str(text) => text.clone(),
Value::U64(num) => format!("{num}"),
Value::I64(num) => format!("{num}"),
@ -268,7 +254,11 @@ fn value_to_string(value: &Value) -> String {
// TODO implement me
unimplemented!();
}
Value::JsonObject(json_object) => {
Value::Array(arr) => {
let inner: Vec<_> = arr.iter().map(value_to_string).collect();
format!("{inner:?}")
}
Value::Object(json_object) => {
serde_json::to_string(&json_object).unwrap()
}
Value::Bool(b) => format!("{b}"),
@ -308,6 +298,8 @@ where
/// necessary for serialization.
#[derive(Deserialize, Serialize)]
enum SerdeValue {
/// Null
Null,
/// The str type is used for any text information.
Str(String),
/// Pre-tokenized str type,
@ -330,8 +322,10 @@ enum SerdeValue {
Facet(tv::schema::Facet),
/// Arbitrarily sized byte array
Bytes(Vec<u8>),
/// Json object value.
JsonObject(serde_json::Map<String, serde_json::Value>),
/// Array
Array(Vec<Value>),
/// Object value.
Object(BTreeMap<String, Value>),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(Ipv6Addr),
}
@ -339,6 +333,7 @@ enum SerdeValue {
impl From<SerdeValue> for Value {
fn from(value: SerdeValue) -> Self {
match value {
SerdeValue::Null => Self::Null,
SerdeValue::Str(v) => Self::Str(v),
SerdeValue::PreTokStr(v) => Self::PreTokStr(v),
SerdeValue::U64(v) => Self::U64(v),
@ -347,7 +342,8 @@ impl From<SerdeValue> for Value {
SerdeValue::Date(v) => Self::Date(v),
SerdeValue::Facet(v) => Self::Facet(v),
SerdeValue::Bytes(v) => Self::Bytes(v),
SerdeValue::JsonObject(v) => Self::JsonObject(v),
SerdeValue::Array(v) => Self::Array(v),
SerdeValue::Object(v) => Self::Object(v),
SerdeValue::Bool(v) => Self::Bool(v),
SerdeValue::IpAddr(v) => Self::IpAddr(v),
}
@ -357,6 +353,7 @@ impl From<SerdeValue> for Value {
impl From<Value> for SerdeValue {
fn from(value: Value) -> Self {
match value {
Value::Null => Self::Null,
Value::Str(v) => Self::Str(v),
Value::PreTokStr(v) => Self::PreTokStr(v),
Value::U64(v) => Self::U64(v),
@ -365,7 +362,8 @@ impl From<Value> for SerdeValue {
Value::Date(v) => Self::Date(v),
Value::Facet(v) => Self::Facet(v),
Value::Bytes(v) => Self::Bytes(v),
Value::JsonObject(v) => Self::JsonObject(v),
Value::Array(v) => Self::Array(v),
Value::Object(v) => Self::Object(v),
Value::Bool(v) => Self::Bool(v),
Value::IpAddr(v) => Self::IpAddr(v),
}
@ -376,6 +374,8 @@ impl From<Value> for SerdeValue {
/// cloning.
#[derive(Serialize)]
enum BorrowedSerdeValue<'a> {
/// Null
Null,
/// The str type is used for any text information.
Str(&'a str),
/// Pre-tokenized str type,
@ -395,8 +395,10 @@ enum BorrowedSerdeValue<'a> {
Facet(&'a tv::schema::Facet),
/// Arbitrarily sized byte array
Bytes(&'a [u8]),
/// Array
Array(&'a Vec<Value>),
/// Json object value.
JsonObject(&'a serde_json::Map<String, serde_json::Value>),
Object(&'a BTreeMap<String, Value>),
/// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`.
IpAddr(&'a Ipv6Addr),
}
@ -404,6 +406,7 @@ enum BorrowedSerdeValue<'a> {
impl<'a> From<&'a Value> for BorrowedSerdeValue<'a> {
fn from(value: &'a Value) -> Self {
match value {
Value::Null => Self::Null,
Value::Str(v) => Self::Str(v),
Value::PreTokStr(v) => Self::PreTokStr(v),
Value::U64(v) => Self::U64(v),
@ -412,7 +415,8 @@ impl<'a> From<&'a Value> for BorrowedSerdeValue<'a> {
Value::Date(v) => Self::Date(v),
Value::Facet(v) => Self::Facet(v),
Value::Bytes(v) => Self::Bytes(v),
Value::JsonObject(v) => Self::JsonObject(v),
Value::Array(v) => Self::Array(v),
Value::Object(v) => Self::Object(v),
Value::Bool(v) => Self::Bool(v),
Value::IpAddr(v) => Self::IpAddr(v),
}
@ -559,8 +563,7 @@ impl Document {
py_dict: &PyDict,
schema: Option<&Schema>,
) -> PyResult<Document> {
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> =
BTreeMap::new();
let mut field_values: BTreeMap<String, Vec<Value>> = BTreeMap::new();
Document::extract_py_values_from_dict(
py_dict,
schema,
@ -809,7 +812,7 @@ impl Document {
fn extract_py_values_from_dict(
py_dict: &PyDict,
schema: Option<&Schema>,
out_field_values: &mut BTreeMap<String, Vec<tv::schema::Value>>,
out_field_values: &mut BTreeMap<String, Vec<Value>>,
) -> PyResult<()> {
// TODO: Reserve when https://github.com/rust-lang/rust/issues/72631 is stable.
// out_field_values.reserve(py_dict.len());

View File

@ -16,7 +16,10 @@ use crate::{
use tantivy as tv;
use tantivy::{
directory::MmapDirectory,
schema::{NamedFieldDocument, Term, Value},
schema::{
document::TantivyDocument, NamedFieldDocument, OwnedValue as Value,
Term,
},
tokenizer::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
TextAnalyzer,
@ -73,7 +76,8 @@ impl IndexWriter {
/// since the creation of the index.
pub fn add_document(&mut self, doc: &Document) -> PyResult<u64> {
let named_doc = NamedFieldDocument(doc.field_values.clone());
let doc = self.schema.convert_named_doc(named_doc).map_err(to_pyerr)?;
let doc = TantivyDocument::convert_named_doc(&self.schema, named_doc)
.map_err(to_pyerr)?;
self.inner()?.add_document(doc).map_err(to_pyerr)
}
@ -86,7 +90,8 @@ impl IndexWriter {
/// The `opstamp` represents the number of documents that have been added
/// since the creation of the index.
pub fn add_json(&mut self, json: &str) -> PyResult<u64> {
let doc = self.schema.parse_document(json).map_err(to_pyerr)?;
let doc = TantivyDocument::parse_json(&self.schema, json)
.map_err(to_pyerr)?;
let opstamp = self.inner()?.add_document(doc);
opstamp.map_err(to_pyerr)
}
@ -154,6 +159,11 @@ impl IndexWriter {
let field = get_field(&self.schema, field_name)?;
let value = extract_value(field_value)?;
let term = match value {
Value::Null => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{field_name}` is null type not deletable."
)))
},
Value::Str(text) => Term::from_field_text(field, &text),
Value::U64(num) => Term::from_field_u64(field, num),
Value::I64(num) => Term::from_field_i64(field, num),
@ -170,7 +180,12 @@ impl IndexWriter {
"Field `{field_name}` is pretokenized. This is not authorized for delete."
)))
}
Value::JsonObject(_) => {
Value::Array(_) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{field_name}` is array type not deletable."
)))
}
Value::Object(_) => {
return Err(exceptions::PyValueError::new_err(format!(
"Field `{field_name}` is json object type not deletable."
)))
@ -297,9 +312,9 @@ impl Index {
) -> Result<(), PyErr> {
let reload_policy = reload_policy.to_lowercase();
let reload_policy = match reload_policy.as_ref() {
"commit" => tv::ReloadPolicy::OnCommit,
"on-commit" => tv::ReloadPolicy::OnCommit,
"oncommit" => tv::ReloadPolicy::OnCommit,
"commit" => tv::ReloadPolicy::OnCommitWithDelay,
"on-commit" => tv::ReloadPolicy::OnCommitWithDelay,
"oncommit" => tv::ReloadPolicy::OnCommitWithDelay,
"manual" => tv::ReloadPolicy::Manual,
_ => return Err(exceptions::PyValueError::new_err(
"Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"

View File

@ -1,5 +1,5 @@
use ::tantivy as tv;
use ::tantivy::schema::{Term, Value};
use ::tantivy::schema::{OwnedValue as Value, Term};
use pyo3::{exceptions, prelude::*, wrap_pymodule};
mod document;

View File

@ -319,7 +319,7 @@ impl ExpectedBase64Error {
/// If `true`, the length of the base64 string was invalid.
fn caused_by_invalid_length(&self) -> bool {
matches!(self.decode_error, base64::DecodeError::InvalidLength)
matches!(self.decode_error, base64::DecodeError::InvalidLength(_))
}
/// The last non-padding input symbol's encoded 6 bits have nonzero bits that will be discarded.

View File

@ -5,6 +5,11 @@ use pyo3::{basic::CompareOp, exceptions::PyValueError, prelude::*};
use serde::{Deserialize, Serialize};
use tantivy as tv;
use tantivy::collector::{Count, MultiCollector, TopDocs};
use tantivy::TantivyDocument;
// Bring the trait into scope. This is required for the `to_named_doc` method.
// However, tantivy-py declares its own `Document` class, so we need to avoid
// introducing the `Document` trait name into the namespace (hence `as _`).
use tantivy::Document as _;
/// Tantivy's Searcher class
///
@ -248,9 +253,10 @@ impl Searcher {
///
/// Returns the Document, raises ValueError if the document can't be found.
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
let named_doc = self.inner.schema().to_named_doc(&doc);
Ok(Document {
let doc: TantivyDocument =
self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
let named_doc = doc.to_named_doc(self.inner.schema());
Ok(crate::document::Document {
field_values: named_doc.0,
})
}

View File

@ -1,6 +1,8 @@
use crate::to_pyerr;
use pyo3::prelude::*;
use tantivy as tv;
// Bring the trait into scope to use methods like `as_str()` on `OwnedValue`.
use tantivy::schema::Value;
/// Tantivy Snippet
///
@ -71,7 +73,7 @@ impl SnippetGenerator {
pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet {
let text: String = doc
.iter_values_for_field(&self.field_name)
.flat_map(tv::schema::Value::as_text)
.flat_map(|ov| ov.as_str())
.collect::<Vec<&str>>()
.join(" ");

View File

@ -1086,7 +1086,7 @@ class TestQuery(object):
# invalid regex pattern
with pytest.raises(
ValueError, match=r"An invalid argument was passed: 'fish\('"
ValueError, match=r"An invalid argument was passed"
):
Query.regex_query(index.schema, "body", "fish(")
@ -1104,7 +1104,7 @@ class TestQuery(object):
mlt_query = Query.more_like_this_query(doc_address)
assert (
repr(mlt_query)
== "Query(MoreLikeThisQuery { mlt: MoreLikeThis { min_doc_frequency: Some(5), max_doc_frequency: None, min_term_frequency: Some(2), max_query_terms: Some(25), min_word_length: None, max_word_length: None, boost_factor: Some(1.0), stop_words: [] }, target: DocumentAdress(DocAddress { segment_ord: 0, doc_id: 0 }) })"
== "Query(MoreLikeThisQuery { mlt: MoreLikeThis { min_doc_frequency: Some(5), max_doc_frequency: None, min_term_frequency: Some(2), max_query_terms: Some(25), min_word_length: None, max_word_length: None, boost_factor: Some(1.0), stop_words: [] }, target: DocumentAddress(DocAddress { segment_ord: 0, doc_id: 0 }) })"
)
result = index.searcher().search(mlt_query, 10)
assert len(result.hits) == 0
@ -1122,7 +1122,7 @@ class TestQuery(object):
stop_words=["fish"])
assert (
repr(mlt_query)
== "Query(MoreLikeThisQuery { mlt: MoreLikeThis { min_doc_frequency: Some(2), max_doc_frequency: Some(10), min_term_frequency: Some(1), max_query_terms: Some(10), min_word_length: Some(2), max_word_length: Some(20), boost_factor: Some(2.0), stop_words: [\"fish\"] }, target: DocumentAdress(DocAddress { segment_ord: 0, doc_id: 0 }) })"
== "Query(MoreLikeThisQuery { mlt: MoreLikeThis { min_doc_frequency: Some(2), max_doc_frequency: Some(10), min_term_frequency: Some(1), max_query_terms: Some(10), min_word_length: Some(2), max_word_length: Some(20), boost_factor: Some(2.0), stop_words: [\"fish\"] }, target: DocumentAddress(DocAddress { segment_ord: 0, doc_id: 0 }) })"
)
result = index.searcher().search(mlt_query, 10)
assert len(result.hits) > 0
@ -1155,4 +1155,3 @@ class TestQuery(object):
# wrong score type
with pytest.raises(TypeError, match = r"argument 'score': must be real number, not str"):
Query.const_score_query(query, "0.1")