Commit 2868d4d

[BUG] ignore trigrams with null terminator byte when constructing full text index
1 parent a43b1f3 commit 2868d4d

2 files changed: +117 -28 lines changed
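In short: `TokenInstance::encode` becomes fallible. A trigram containing a `\0` character now yields `TokenInstanceEncodeError::NullTerminator`, every call site in the full text index writer skips such trigrams instead of indexing them, and a new test covers documents with embedded null bytes.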

rust/index/src/fulltext/types.rs

Lines changed: 84 additions & 16 deletions
```diff
@@ -1,4 +1,5 @@
 use super::util::TokenInstance;
+use super::util::TokenInstanceEncodeError;
 use chroma_blockstore::{BlockfileFlusher, BlockfileReader, BlockfileWriter};
 use chroma_error::{ChromaError, ErrorCodes};
 use futures::StreamExt;
@@ -85,11 +86,16 @@ impl FullTextIndexWriter {
             .clone()
             .token_stream(new_document)
             .process(&mut |token| {
-                token_instances.push(TokenInstance::encode(
+                match TokenInstance::encode(
                     token.text.as_str(),
                     offset_id,
                     Some(token.offset_from as u32),
-                ));
+                ) {
+                    Ok(encoded) => token_instances.push(encoded),
+                    Err(TokenInstanceEncodeError::NullTerminator) => {
+                        // ignore
+                    }
+                }
             });
     }

```
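This Ok/Err match is the pattern repeated at every call site below: a trigram containing `\0` is silently dropped rather than failing the whole mutation batch.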
```diff
@@ -104,29 +110,46 @@ impl FullTextIndexWriter {
             .clone()
             .token_stream(old_document)
             .process(&mut |token| {
-                trigrams_to_delete.insert(TokenInstance::encode(
+                match TokenInstance::encode(
                     token.text.as_str(),
                     offset_id,
-                    None,
-                ));
+                    Some(token.offset_from as u32),
+                ) {
+                    Ok(encoded) => {
+                        trigrams_to_delete.insert(encoded);
+                    }
+                    Err(TokenInstanceEncodeError::NullTerminator) => {
+                        // ignore
+                    }
+                }
             });

         // Add doc
         self.tokenizer
             .clone()
             .token_stream(new_document)
             .process(&mut |token| {
-                trigrams_to_delete.remove(&TokenInstance::encode(
-                    token.text.as_str(),
-                    offset_id,
-                    None,
-                ));
-
-                token_instances.push(TokenInstance::encode(
+                match TokenInstance::encode(token.text.as_str(), offset_id, None) {
+                    Ok(encoded) => {
+                        trigrams_to_delete.remove(&encoded);
+                    }
+                    Err(TokenInstanceEncodeError::NullTerminator) => {
+                        // ignore
+                    }
+                }
+
+                match TokenInstance::encode(
                     token.text.as_str(),
                     offset_id,
                     Some(token.offset_from as u32),
-                ));
+                ) {
+                    Ok(encoded) => {
+                        token_instances.push(encoded);
+                    }
+                    Err(TokenInstanceEncodeError::NullTerminator) => {
+                        // ignore
+                    }
+                }
             });

         token_instances.extend(trigrams_to_delete.into_iter());
@@ -143,11 +166,18 @@ impl FullTextIndexWriter {
             .clone()
             .token_stream(old_document)
             .process(&mut |token| {
-                trigrams_to_delete.insert(TokenInstance::encode(
+                match TokenInstance::encode(
                     token.text.as_str(),
                     offset_id,
-                    None,
-                ));
+                    Some(token.offset_from as u32),
+                ) {
+                    Ok(encoded) => {
+                        trigrams_to_delete.insert(encoded);
+                    }
+                    Err(TokenInstanceEncodeError::NullTerminator) => {
+                        // ignore
+                    }
+                }
             });

         token_instances.extend(trigrams_to_delete.into_iter());
```
```diff
@@ -909,6 +939,44 @@ mod tests {
         assert_eq!(res.len(), 3);
     }

+    #[tokio::test]
+    async fn test_document_with_null_terminators() {
+        let tmp_dir = tempdir().unwrap();
+        let storage = Storage::Local(LocalStorage::new(tmp_dir.path().to_str().unwrap()));
+        let block_cache = new_cache_for_test();
+        let root_cache = new_cache_for_test();
+        let provider = BlockfileProvider::new_arrow(storage, 1024 * 1024, block_cache, root_cache);
+        let pl_blockfile_writer = provider
+            .write::<u32, Vec<u32>>(BlockfileWriterOptions::default().ordered_mutations())
+            .await
+            .unwrap();
+        let pl_blockfile_id = pl_blockfile_writer.id();
+
+        let tokenizer = NgramTokenizer::new(3, 3, false).unwrap();
+        let mut index_writer = FullTextIndexWriter::new(pl_blockfile_writer, tokenizer.clone());
+
+        index_writer
+            .handle_batch([DocumentMutation::Create {
+                offset_id: 1,
+                new_document: "hello \0 wor\0ld",
+            }])
+            .unwrap();
+
+        index_writer.write_to_blockfiles().await.unwrap();
+        let flusher = index_writer.commit().await.unwrap();
+        flusher.flush().await.unwrap();
+
+        let pl_blockfile_reader = provider
+            .read::<u32, &[u32]>(&pl_blockfile_id)
+            .await
+            .unwrap();
+        let tokenizer = NgramTokenizer::new(3, 3, false).unwrap();
+        let index_reader = FullTextIndexReader::new(pl_blockfile_reader, tokenizer);
+
+        let res = index_reader.search("hello").await.unwrap();
+        assert_eq!(res, RoaringBitmap::from([1]));
+    }
+
     #[tokio::test]
     async fn test_update_document() {
         let tmp_dir = tempdir().unwrap();
```
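The new test indexes a document containing embedded null bytes (`hello \0 wor\0ld`) and asserts that searching for a clean token still returns the document, confirming that null-containing trigrams are skipped without corrupting the rest of the index.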

rust/index/src/fulltext/util.rs

Lines changed: 33 additions & 12 deletions
```diff
@@ -1,3 +1,5 @@
+use thiserror::Error;
+
 /// A token instance is a unique value containing a trigram, an offset ID, and optionally a position within a document.
 /// These three attributes are packed into a single u128 value:
 /// - The trigram is a 63-bit value, packed into the top 64 bits.
@@ -8,15 +10,21 @@
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct TokenInstance(u128);

-// Unicode characters only use 21 bits, so we can encode a trigram in 21 * 3 = 63 bits (a u64).
+/// Unicode characters only use 21 bits, so we can encode a trigram in 21 * 3 = 63 bits (a u64).
+/// Returns None if the string contains a null terminator.
 #[inline(always)]
-fn pack_trigram(s: &str) -> u64 {
+fn pack_trigram(s: &str) -> Option<u64> {
     let mut u = 0u64;
     for (i, c) in s.chars().take(3).enumerate() {
+        if c == '\0' {
+            return None;
+        }
+
         let shift = (2 - i) * 21;
         u |= (c as u64) << shift;
     }
-    u
+
+    Some(u)
 }

 fn encode_utf8_unchecked(c: u32, buf: &mut [u8]) -> usize {
```
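To make the doc comment above concrete, here is a standalone sketch (not the crate's code, just the same arithmetic) of the packing scheme: each Unicode scalar value fits in 21 bits, so three of them occupy the low 63 bits of a `u64`, and after this change a trigram containing `\0` is rejected instead of packed.

```rust
// Standalone sketch of the 21-bit trigram packing described above.
fn pack_trigram_sketch(s: &str) -> Option<u64> {
    let mut u = 0u64;
    for (i, c) in s.chars().take(3).enumerate() {
        if c == '\0' {
            // a '\0' packs to all-zero bits, indistinguishable from "no character"
            return None;
        }
        let shift = (2 - i) * 21; // first char lands in the highest 21-bit slot
        u |= (c as u64) << shift;
    }
    Some(u)
}

fn main() {
    // "abc" packs as ('a' << 42) | ('b' << 21) | 'c'
    assert_eq!(pack_trigram_sketch("abc"), Some((97u64 << 42) | (98 << 21) | 99));
    // any trigram containing a null byte is now rejected
    assert_eq!(pack_trigram_sketch("wo\0"), None);
}
```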
```diff
@@ -74,16 +82,29 @@ fn unpack_trigram(u: u64) -> String {
     s
 }

+#[derive(Debug, Error)]
+pub enum TokenInstanceEncodeError {
+    #[error("Token contains null terminator")]
+    NullTerminator,
+}
+
 impl TokenInstance {
     pub const MAX: Self = Self(u128::MAX);

     #[inline(always)]
-    pub fn encode(token: &str, offset_id: u32, position: Option<u32>) -> Self {
-        TokenInstance(
-            ((pack_trigram(token) as u128) << 64)
-                | ((offset_id as u128) << 32)
-                | (position.map(|o| o | (1 << 31)).unwrap_or(0) as u128),
-        )
+    pub fn encode(
+        token: &str,
+        offset_id: u32,
+        position: Option<u32>,
+    ) -> Result<Self, TokenInstanceEncodeError> {
+        match pack_trigram(token) {
+            Some(packed) => Ok(TokenInstance(
+                ((packed as u128) << 64)
+                    | ((offset_id as u128) << 32)
+                    | (position.map(|o| o | (1 << 31)).unwrap_or(0) as u128),
+            )),
+            None => Err(TokenInstanceEncodeError::NullTerminator),
+        }
     }

     #[inline(always)]
```
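For reference, here is a minimal standalone sketch of the `u128` layout that `encode` now wraps in a `Result`, following the field layout documented at the top of this file: trigram in the top 64 bits, offset ID in bits 32..63, and position in the low 32 bits with bit 31 serving as a "position present" flag.

```rust
// Standalone sketch of the u128 layout produced by TokenInstance::encode.
fn encode_sketch(packed_trigram: u64, offset_id: u32, position: Option<u32>) -> u128 {
    ((packed_trigram as u128) << 64)        // trigram: top 64 bits (63 used)
        | ((offset_id as u128) << 32)       // offset ID: bits 32..63
        | (position.map(|p| p | (1 << 31)).unwrap_or(0) as u128) // position + flag bit
}

fn main() {
    let v = encode_sketch(0x1234, 7, Some(5));
    assert_eq!((v >> 64) as u64, 0x1234);   // trigram recovered
    assert_eq!((v >> 32) as u32, 7);        // offset ID recovered
    assert_eq!(v as u32, 5 | (1 << 31));    // position carries the presence flag
    assert_eq!(encode_sketch(0x1234, 7, None) as u32, 0); // no position: low bits zero
}
```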
```diff
@@ -121,7 +142,7 @@ mod tests {
     proptest! {
         #[test]
         fn test_pack_unpack_trigram(token in "\\PC{3}", offset_id in 0..u32::MAX, position in proptest::option::of((0..u32::MAX).prop_map(|v| v >> 1))) {
-            let encoded = TokenInstance::encode(&token, offset_id, position);
+            let encoded = TokenInstance::encode(&token, offset_id, position).unwrap();
             let decoded_token = encoded.get_token();
             let decoded_offset_id = encoded.get_offset_id();
             let decoded_position = encoded.get_position();
@@ -133,8 +154,8 @@ mod tests {

         #[test]
         fn test_omit_position(token in "\\PC{3}", offset_id in 0..u32::MAX, position1 in proptest::option::of(0..u32::MAX), position2 in proptest::option::of(0..u32::MAX)) {
-            let encoded1 = TokenInstance::encode(&token, offset_id, position1);
-            let encoded2 = TokenInstance::encode(&token, offset_id, position2);
+            let encoded1 = TokenInstance::encode(&token, offset_id, position1).unwrap();
+            let encoded2 = TokenInstance::encode(&token, offset_id, position2).unwrap();

             assert_eq!(encoded1.omit_position(), encoded2.omit_position(), "Omitting position should make two token instances equal");
             assert_eq!(encoded1.omit_position().get_token(), encoded1.get_token(), "Omitting position should not change the token");
```
