Topic Modeling is an unsupervised machine learning technique that automatically identifies abstract topics in a collection of documents or other text data. It helps uncover the hidden thematic structure of large text corpora.
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of short animal-related sentences
documents = [
    "Cats are small furry animals.",
    "Dogs are loyal and friendly pets.",
    "Birds can fly and have feathers.",
    "Fish live in water and swim.",
    "Lions are big cats that live in the wild.",
]

# Build a bag-of-words document-term matrix, dropping common English stop words
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(documents)
print("Feature names:", vectorizer.get_feature_names_out())
from sklearn.decomposition import LatentDirichletAllocation

# Fit an LDA model that explains the corpus with two latent topics
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(dtm)
def display_topics(model, feature_names, no_top_words):
    # For each topic, print the no_top_words highest-weighted terms
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}: ", end='')
        print(" ".join(feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]))
display_topics(lda, vectorizer.get_feature_names_out(), 3)
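Note that model.components_ holds unnormalized topic-word pseudo-counts; per the scikit-learn documentation, dividing each row by its sum turns it into that topic's word distribution. A small sketch (the name topic_word is ours):

# Normalize each topic's pseudo-counts into a probability distribution over words
topic_word = lda.components_ / lda.components_.sum(axis=1, keepdims=True)
print(topic_word.sum(axis=1))  # each row now sums to 1.0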
# Infer the topic mixture of an unseen document with the fitted model
new_doc = ["Fishes are aquatic animals."]
new_dtm = vectorizer.transform(new_doc)
topic_distribution = lda.transform(new_dtm)
print("Topic distribution:", topic_distribution)
# A second LDA run with more topics and explicit training settings
lda_tuned = LatentDirichletAllocation(
    n_components=3,           # number of latent topics
    learning_method='batch',  # use the whole corpus in every EM iteration
    max_iter=20,
    random_state=0
)
lda_tuned.fit(dtm)
display_topics(lda_tuned, vectorizer.get_feature_names_out(), 3)
from sklearn.decomposition import NMF

# Non-negative Matrix Factorization is a common alternative to LDA
nmf = NMF(n_components=2, random_state=1)
nmf.fit(dtm)
# Show the three highest-weighted words per NMF component
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(nmf.components_):
    print(f"NMF Topic {idx}: ", " ".join(feature_names[i] for i in topic.argsort()[:-4:-1]))
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# One-time downloads: tokenizer models, WordNet data, and the stop word list
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess(text):
    # Lowercase and tokenize, keep alphabetic non-stop-word tokens, lemmatize
    tokens = word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    )
# Clean the corpus and rebuild the document-term matrix on the lemmatized text
preprocessed_docs = [preprocess(doc) for doc in documents]
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(preprocessed_docs)
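To see what the cleaning changes, one can refit LDA on the rebuilt matrix and reuse display_topics from above; a brief sketch (the name lda_clean is ours):

lda_clean = LatentDirichletAllocation(n_components=2, random_state=42)
lda_clean.fit(dtm)
display_topics(lda_clean, vectorizer.get_feature_names_out(), 3)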
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Sample corpus
documents = [
    "Apple and banana are fruits.",
    "I love to eat banana and mango.",
    "Fruits like apple and mango are delicious.",
    "Python and Java are popular programming languages.",
    "I write code in Python and sometimes in Java.",
    "Programming with Python is fun."
]
# Vectorize the text data
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(documents)
# Fit LDA model
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(dtm)
# Plot top words per topic
def plot_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_features_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_indices]
        top_weights = topic[top_features_indices]
        plt.figure(figsize=(8, 5))
        plt.barh(top_features[::-1], top_weights[::-1])
        plt.xlabel("Word Importance")
        plt.title(f"Top {n_top_words} Words in Topic #{topic_idx + 1}")
        plt.tight_layout()
        plt.show()
# Get feature (vocabulary) names
feature_names = vectorizer.get_feature_names_out()
plot_top_words(lda, feature_names, n_top_words=7)
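Before projecting to 2-D, a quick textual check is to print each document's dominant topic from its inferred mixture (the name doc_topics is ours):

doc_topics = lda.transform(dtm)
for i, dist in enumerate(doc_topics):
    print(f"Doc {i}: dominant topic {dist.argmax()} (weight {dist.max():.2f})")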
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
# Topic distributions from the LDA model
topic_distributions = lda.transform(dtm) # shape: (n_documents, n_topics)
# Dimensionality reduction using t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=5, max_iter=1000)
tsne_results = tsne.fit_transform(topic_distributions)
# Dominant topic per document
dominant_topic = np.argmax(topic_distributions, axis=1)
# Plot the 2D t-SNE projection
plt.figure(figsize=(8, 6))
scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=dominant_topic, cmap='tab10')
plt.colorbar(scatter, label='Dominant Topic')
plt.title("Document Clusters via LDA Topic Distributions (t-SNE)")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.tight_layout()
plt.show()
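Finally, the number of topics is a free hyperparameter. One common heuristic is to sweep n_components and compare perplexities, ideally on held-out documents; a minimal sketch on this toy corpus (the name lda_k is ours):

# Compare perplexity (lower is better) across candidate topic counts
for k in range(2, 6):
    lda_k = LatentDirichletAllocation(n_components=k, random_state=42).fit(dtm)
    print(f"n_components={k}: perplexity={lda_k.perplexity(dtm):.2f}")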