Stopwords are common words in a language, such as "is", "and", "the", and "in", that usually add little meaning to a sentence on their own. Removing them is a standard preprocessing step in NLP that lets downstream analysis focus on the content-bearing words.
Each language has its own set of stopwords; for English, the NLTK library provides a comprehensive list.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# One-time downloads: the stopword lists and the Punkt tokenizer models.
# Newer NLTK releases may also require nltk.download('punkt_tab').
nltk.download('stopwords')
nltk.download('punkt')

# A set gives O(1) membership checks while filtering.
stop_words = set(stopwords.words('english'))
text = "This is a sample sentence, showing off the stopwords filtration."
words = word_tokenize(text)
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print("Original:", words)
print("Filtered:", filtered_sentence)
# Extend the standard list with domain-specific stopwords.
custom_stopwords = stop_words.union({"sample", "showing"})
filtered_custom = [w for w in words if w.lower() not in custom_stopwords]
print("Filtered with custom stopwords:", filtered_custom)
# Count how many of the tokens are stopwords.
stopword_count = sum(1 for w in words if w.lower() in stop_words)
print("Number of stopwords:", stopword_count)
Scikit-learn's vectorizers can drop stopwords during feature extraction, so no separate filtering pass is needed.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["This is a sample sentence showing off the stopwords filtration."]

# stop_words='english' applies scikit-learn's built-in English stopword list.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
print("TF-IDF feature names:", vectorizer.get_feature_names_out())
NLTK also ships stopword lists for many languages besides English.

spanish_stopwords = set(stopwords.words('spanish'))
print("Spanish stopwords sample:", list(spanish_stopwords)[:10])
print("'the' is stopword?", 'the' in stop_words)
print("'Python' is stopword?", 'Python' in stop_words)
# Filter a small corpus sentence by sentence.
large_text = ["This is the first sentence.", "Another sentence with more words."]
filtered_corpus = [
    [word for word in word_tokenize(sent) if word.lower() not in stop_words]
    for sent in large_text
]
print("Filtered corpus:", filtered_corpus)
Removing stopwords before building a word cloud keeps high-frequency function words from dominating the image.

from wordcloud import WordCloud
import matplotlib.pyplot as plt

non_stopwords = [w for w in words if w.lower() not in stop_words]
wordcloud = WordCloud().generate(" ".join(non_stopwords))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
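The wordcloud library can also do the filtering itself through its stopwords parameter, making the manual pass above optional; a minimal sketch reusing the objects defined earlier:

# Let WordCloud drop the stopwords internally instead of pre-filtering.
wc = WordCloud(stopwords=stop_words).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()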
Stopword removal is not always harmless. NLTK's English list includes negations such as "not", and dropping them can flip the polarity that a sentiment analyzer sees.

from textblob import TextBlob

text_with_stopwords = "I am not happy with this horrible experience"
text_without_stopwords = "happy horrible experience"  # "not" was removed along with the stopwords
print("Sentiment with stopwords:", TextBlob(text_with_stopwords).sentiment)
print("Sentiment without stopwords:", TextBlob(text_without_stopwords).sentiment)
Stopword removal also sharpens topic models, since function words would otherwise dominate every topic.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly"
]

# Drop English stopwords while building the document-term matrix.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X)

# Show the three highest-weighted words per topic.
feature_names = vectorizer.get_feature_names_out()
print("Topic word distribution:")
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx}:", [feature_names[i] for i in topic.argsort()[-3:]])
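To see the effect, run the same vectorizer without stop_words='english': function words such as "the" then enter the vocabulary and compete for topic mass; a quick check:

# Without stopword removal, "the" makes it into the vocabulary.
raw_vectorizer = CountVectorizer()
raw_vectorizer.fit(docs)
print("'the' in vocabulary:", 'the' in raw_vectorizer.vocabulary_)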