Word tokenization is the process of splitting sentences or text into individual words or tokens. It is essential for tasks like parsing, text analysis, and language modeling.
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
nltk.download('punkt')  # download the Punkt tokenizer data used by word_tokenize (needed once)
text = "Hello world! NLP's tokenization is essential for text analysis."
# Example 1: Basic word tokenization
tokens = word_tokenize(text)
print(tokens)
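# The default NLTK tokenizer separates punctuation and clitics, so '!' and "'s" come out as their own tokens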
# Example 2: Tokenizing contractions correctly
text2 = "Don't hesitate to ask questions."
print(word_tokenize(text2))
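# The Treebank-style rules split the contraction, so "Don't" becomes 'Do' and "n't"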
# Example 3: Using RegexpTokenizer to keep only words
tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize(text))
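# r'\w+' matches only alphanumerics and underscores, so punctuation is dropped and "NLP's" becomes 'NLP' and 's'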
# Example 4: Using the spaCy tokenizer
import spacy
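# Assumes the small English model is installed, e.g. via: python -m spacy download en_core_web_sm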
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
print([token.text for token in doc])
# Example 5: Tokenize and convert to lowercase
tokens_lower = [t.lower() for t in tokens]
print(tokens_lower)
# Example 6: Tokenize words including hyphens
text3 = "State-of-the-art NLP techniques are evolving."
print(word_tokenize(text3))
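# Hyphenated compounds such as 'State-of-the-art' are typically kept as a single token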
# Example 7: Tokenizing text with numbers and symbols
text4 = "Python 3.8.5 was released in 2020!"
print(word_tokenize(text4))
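# Version strings like '3.8.5' are typically kept whole, while the trailing '!' is split off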
# Example 8: Removing punctuation tokens
tokens_no_punc = [t for t in tokens if t.isalpha()]
print(tokens_no_punc)
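# Note: str.isalpha() also drops numeric tokens and mixed tokens like "'s", not just punctuation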
# Example 9: Tokenizing multilingual text
text_de = "Hallo Welt! Wie geht's?"
print(word_tokenize(text_de, language='german'))
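# The language argument mainly selects the German Punkt sentence model; the word-level splitting rules stay the same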
# Example 10: Tokenizing emoji and special characters (basic)
text5 = "I love NLP 😊 #AI"
print(word_tokenize(text5))
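The default tokenizer tends to split hashtags (here '#AI' typically comes out as '#' and 'AI'), so social-media text is often handled with NLTK's TweetTokenizer instead. The snippet below is a minimal sketch of that approach, reusing the same sample text; the output in the comment is the typical result.
# Example 11: Tokenizing social-media text with TweetTokenizer (keeps hashtags and emoji intact)
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
print(tweet_tokenizer.tokenize(text5))  # e.g. ['I', 'love', 'NLP', '😊', '#AI']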