@@ -56,12 +56,12 @@ def test_large_chunks(self, chunker):
56
56
assert chunker .tokenizer .count_tokens (chunk .value ) <= MAX_TOKENS
57
57
58
58
assert chunks [0 ].value .startswith ("foo-0!" )
59
- assert chunks [1 ].value .startswith ("foo-11 !" )
60
- assert chunks [2 ].value .startswith ("foo-17 !" )
59
+ assert chunks [1 ].value .startswith ("foo-7 !" )
60
+ assert chunks [2 ].value .startswith ("foo-13 !" )
61
61
assert chunks [3 ].value .startswith ("foo-0." )
62
62
63
- assert chunks [0 ].value .endswith ("! foo-10 !" )
64
- assert chunks [1 ].value .endswith ("! foo-16 !" )
63
+ assert chunks [0 ].value .endswith ("! foo-6 !" )
64
+ assert chunks [1 ].value .endswith ("! foo-12 !" )
65
65
assert chunks [2 ].value .endswith ("! foo-24!" )
66
66
assert chunks [3 ].value .endswith (". foo-11." )
67
67
@@ -92,19 +92,19 @@ def test_separators(self, chunker):
92
92
assert chunker .tokenizer .count_tokens (chunk .value ) <= MAX_TOKENS
93
93
94
94
assert chunks [0 ].value .startswith ("foo-0!" )
95
- assert chunks [1 ].value .startswith ("foo-11 !" )
96
- assert chunks [2 ].value .startswith ("foo-17 !" )
95
+ assert chunks [1 ].value .startswith ("foo-7 !" )
96
+ assert chunks [2 ].value .startswith ("foo-13 !" )
97
97
assert chunks [3 ].value .startswith ("foo-0." )
98
98
assert chunks [4 ].value .startswith ("foo-0?" )
99
- assert chunks [5 ].value .startswith ("foo-9 ?" )
99
+ assert chunks [5 ].value .startswith ("foo-7 ?" )
100
100
assert chunks [6 ].value .startswith ("foo-0" )
101
101
assert chunks [7 ].value .startswith ("foo-8" )
102
102
103
- assert chunks [0 ].value .endswith ("! foo-10 !" )
104
- assert chunks [1 ].value .endswith ("! foo-16 !" )
103
+ assert chunks [0 ].value .endswith ("! foo-6 !" )
104
+ assert chunks [1 ].value .endswith ("! foo-12 !" )
105
105
assert chunks [2 ].value .endswith ("! foo-24!" )
106
106
assert chunks [3 ].value .endswith (". foo-11." )
107
- assert chunks [4 ].value .endswith ("? foo-8 ?" )
107
+ assert chunks [4 ].value .endswith ("? foo-6 ?" )
108
108
assert chunks [5 ].value .endswith ("? foo-12?" )
109
109
assert chunks [6 ].value .endswith (" foo-7" )
110
110
assert chunks [7 ].value .endswith (" foo-16" )
@@ -138,3 +138,15 @@ def test_artifact_reference(self, chunker):
138
138
139
139
for chunk in chunks :
140
140
assert chunk .reference is None
141
+
142
+ def test_midpoint_index_empty_subchunks (self , chunker ):
143
+ # Checks that the midpoint index is found correctly when some of the subchunks are empty strings.
144
+ # Previously ["foo", '', "bar", 'baz'] would be token-counted as 'foobarbaz' rather than 'foo bar baz'
145
+ # when calculating the midpoint index, i.e. the separators were left out of the count.
146
+ # https://github.com/griptape-ai/griptape/issues/1796
147
+ chunker .max_tokens = 3  # lower the token limit; the asserts below expect some inputs to split into 2 chunks
148
+
149
+ assert len (chunker .chunk ("foo bar baz" )) == 1  # baseline input: expected to stay in a single chunk
150
+ assert len (chunker .chunk ("foo bar baz " )) == 2  # NOTE(review): whitespace inside this literal may not be shown faithfully here - confirm against the real file
151
+
152
+ assert len (chunker .chunk ("foo bar baz" )) == 2  # NOTE(review): as shown, this input is identical to the first assert's yet expects 2; the literals presumably differ in whitespace (e.g. consecutive spaces producing the empty subchunk described above) - TODO confirm
0 commit comments