import spacy

# Load the spaCy English model. Download it first if missing:
#   python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
def adaptive_tokenizer(text):
    """Tokenize *text* with spaCy and return a flat list of token strings.

    Parameters
    ----------
    text : str
        The raw input text to tokenize.

    Returns
    -------
    list[str]
        The surface form (``token.text``) of every token in *text*,
        punctuation included.

    Notes
    -----
    The original implementation tried to "split" punctuation tokens via
    ``token.text.split(token.text_with_ws)``, which is a bug: when the
    token has no trailing whitespace the separator equals the string
    itself and the split yields ``['', '']``, injecting empty strings
    into the result; otherwise the split is a no-op. spaCy already emits
    punctuation marks as separate tokens, so no extra splitting is
    needed — every token's text is appended directly.
    """
    # Run the spaCy pipeline on the input (uses the module-level `nlp`).
    doc = nlp(text)
    # Collect each token's surface text; punctuation tokens are already
    # separated by spaCy's tokenizer.
    return [token.text for token in doc]
import spacy

# NOTE(review): this import/model-load is a duplicate of the one at the
# top of the file — consider removing this repeated section entirely.
# Load the spaCy English model. Download it first if missing:
#   python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
def adaptive_tokenizer(text):
# Process the input text using spaCy
doc = nlp(text)