Split any text for LLM, RAG, or agent applications.
- Split by text, semantics, tokens, Markdown, programming language, custom rules, etc.
_ ____ _ _
/ \ _ __ _ _ / ___| |__ _ _ _ __ | | _____ _ __
/ _ \ | '_ \| | | | | | '_ \| | | | '_ \| |/ / _ \ '__|
/ ___ \| | | | |_| | |___| | | | |_| | | | | < __/ |
/_/ \_\_| |_|\__, |\____|_| |_|\__,_|_| |_|_|\_\___|_|
|___/
pip install .
or
pip install -e .

from anychunker.text import AnyTextChunker
text = """
# 1111
## 1111.22
dsdsdsds
## 1.4 dsdsdd
dajajfsdfds
###### dsdsdsd
"""

## by regex split
model1 = AnyTextChunker(chunk_size = 50, chunk_overlap = 0)
model1.invoke(text)
Document(metadata=DocumentMetadata(created=datetime.datetime(2025, 7, 22, 15, 1, 59, 827280), name='default', topic='default', tag='default', length=70), chunks=[Chunker(metadata={}, chunk_id=0, chunk_size=26, start_pos=1, end_pos=27, content='# 1111\n## 1111.22\ndsdsdsds'), Chunker(metadata={}, chunk_id=1, chunk_size=40, start_pos=29, end_pos=69, content='## 1.4 dsdsdd\ndajajfsdfds\n###### dsdsdsd')])

for x in model1.invoke(text).batchIterator(batch_size = 1):
    print(x,'\n\n')
ChunkBatcher(batch_index=0, batch_size=1, chunks=[Chunker(metadata={}, chunk_id=0, chunk_size=26, start_pos=1, end_pos=27, content='# 1111\n## 1111.22\ndsdsdsds')], metadata=DocumentMetadata(created=datetime.datetime(2025, 7, 22, 15, 2, 2, 861414), name='default', topic='default', tag='default', length=70), actual_size=1, total_content_length=26, start_chunk_id=0, end_chunk_id=0)
ChunkBatcher(batch_index=1, batch_size=1, chunks=[Chunker(metadata={}, chunk_id=1, chunk_size=40, start_pos=29, end_pos=69, content='## 1.4 dsdsdd\ndajajfsdfds\n###### dsdsdsd')], metadata=DocumentMetadata(created=datetime.datetime(2025, 7, 22, 15, 2, 2, 861414), name='default', topic='default', tag='default', length=70), actual_size=1, total_content_length=40, start_chunk_id=1, end_chunk_id=1)

## by transformer tokenizer
model2 = AnyTextChunker.from_tokenizer("Qwen/Qwen3-8B",chunk_size = 50, chunk_overlap = 0)
model2.invoke(text)
Document(metadata=DocumentMetadata(created=datetime.datetime(2025, 7, 22, 14, 57, 45, 431157), name='default', topic='default', tag='default', length=45), chunks=[Chunker(metadata={}, chunk_id=0, chunk_size=43, start_pos=1, end_pos=44, content='# 1111\n## 1111.22\ndsdsdsds\n\n## 1.4 dsdsdd\ndajajfsdfds\n###### dsdsdsd')])

import jieba
def _tokenizer_length(text: str) -> int:
    return len(jieba.lcut(text))
model1 = AnyTextChunker(chunk_size = 50, chunk_overlap = 0, length_function = _tokenizer_length,)
model1.invoke(text)

## by language
from anychunker.base import Language
model3 = AnyTextChunker.from_language(Language.MARKDOWN,chunk_size = 50, chunk_overlap = 0)
model3.invoke(text)
Document(metadata=DocumentMetadata(created=datetime.datetime(2025, 7, 22, 14, 59, 4, 222919), name='default', topic='default', tag='default', length=70), chunks=[Chunker(metadata={}, chunk_id=0, chunk_size=26, start_pos=1, end_pos=27, content='# 1111\n## 1111.22\ndsdsdsds'), Chunker(metadata={}, chunk_id=1, chunk_size=40, start_pos=29, end_pos=69, content='## 1.4 dsdsdd\ndajajfsdfds\n###### dsdsdsd')])

from anychunker.markdown import AnyMarkdownChunker
text = """
# 1111
## 1111.22
dsdsdsds
## 1.4 dsdsdd
dajajfsdfds
###### dsdsdsd
"""
model4 = AnyMarkdownChunker([('#','header1'),('##','Header2')])
model4.invoke(text)
Document(metadata=DocumentMetadata(created=datetime.datetime(2025, 7, 22, 15, 2, 55, 76103), name='default', topic='default', tag='default', length=70), chunks=[Chunker(metadata={'header1': '1111', 'Header2': '1111.22'}, chunk_id=0, chunk_size=8, start_pos=19, end_pos=27, content='dsdsdsds'), Chunker(metadata={'header1': '1111', 'Header2': '1.4 dsdsdd'}, chunk_id=1, chunk_size=26, start_pos=43, end_pos=69, content='dajajfsdfds\n###### dsdsdsd')])

from anychunker.semantics import AnySemanticsChunker
from sentence_transformers import SentenceTransformer
# Load the model
model_dir = "Qwen/Qwen3-Embedding-0.6B"
model = SentenceTransformer(model_dir)
def emb_model(sentences):
    return model.encode(sentences).tolist()
model5 = AnySemanticsChunker(embedding_model = emb_model)
text = """
# 1111
## 1111.22
dsdsdsds.
## 1.4 dsdsdd
dajajfsdfds.
###### dsdsdsd
"""
model5.invoke(text)
Document(metadata=DocumentMetadata(created=datetime.datetime(2025, 7, 22, 16, 9, 12, 397166), name='default', topic='default', tag='default', length=72), chunks=[Chunker(metadata={}, chunk_id=0, chunk_size=17, start_pos=1, end_pos=18, content='# 1111\n## 1111.22'), Chunker(metadata={}, chunk_id=1, chunk_size=51, start_pos=-1, end_pos=50, content='dsdsdsds.\n## 1.4 dsdsdd\ndajajfsdfds.\n###### dsdsdsd')])

# see all functions
docs = model5.invoke(text)
dir(docs)