File size: 916 Bytes
31b6e27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pytest

from preprocessing import num_tokens_from_string, split_in_chunks


def test_split_in_chunks():
    """Check that split_in_chunks keeps every chunk within its token budget.

    Three scenarios are exercised with the "cl100k_base" encoding:

    1. A long repeated text split at a limit of 8191 tokens is expected to
       yield exactly two chunks, each within the limit.
    2. The same text split at several smaller limits must never produce a
       chunk whose token count exceeds the requested limit.
    3. A short text is expected to come back as one chunk equal to the input.
    """
    encoding = "cl100k_base"
    default_limit = 8191

    # Scenario 1: a repeated phrase intended to be about 10000 tokens, i.e.
    # more than one default-sized chunk but fewer than two full ones.
    long_text = "hello world " * 5000
    pieces = split_in_chunks(
        long_text, max_tokens=default_limit, encoding_name=encoding
    )
    assert len(pieces) == 2
    for piece in pieces:
        assert num_tokens_from_string(piece, encoding) <= default_limit

    # Scenario 2: the per-chunk budget must hold for a range of limits.
    for limit in (100, 1000, 3000, 5000):
        pieces = split_in_chunks(
            long_text, max_tokens=limit, encoding_name=encoding
        )
        for piece in pieces:
            assert num_tokens_from_string(piece, encoding) <= limit

    # Scenario 3: input that fits in one chunk is returned unchanged.
    short_text = "This is a short text."
    pieces = split_in_chunks(
        short_text, max_tokens=default_limit, encoding_name=encoding
    )
    assert len(pieces) == 1
    assert pieces[0] == short_text


if __name__ == "__main__":
    # pytest.main() returns the run's exit code. Raising SystemExit with it
    # makes a direct run of this file exit non-zero when a test fails,
    # instead of discarding the result and always exiting 0. Command-line
    # arguments are still picked up by pytest as before, because main() is
    # called without explicit args.
    raise SystemExit(pytest.main())