Topic Modeling is an unsupervised machine learning technique that automatically identifies abstract topics in a collection of documents or other text data. It helps uncover the hidden thematic structure of large text corpora.
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of short animal-related sentences
documents = [
    "Cats are small furry animals.",
    "Dogs are loyal and friendly pets.",
    "Birds can fly and have feathers.",
    "Fish live in water and swim.",
    "Lions are big cats that live in the wild.",
]

# Build a bag-of-words document-term matrix, dropping common English stop words
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(documents)
print("Feature names:", vectorizer.get_feature_names_out())
from sklearn.decomposition import LatentDirichletAllocation

# Fit an LDA model that explains the corpus with two latent topics
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(dtm)
def display_topics(model, feature_names, no_top_words):
    # For each topic, print the no_top_words highest-weighted terms
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}: ", end='')
        print(" ".join(feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]))
display_topics(lda, vectorizer.get_feature_names_out(), 3)
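Note that model.components_ holds unnormalized topic-word pseudo-counts; per the scikit-learn documentation, dividing each row by its sum turns it into that topic's word distribution. A small sketch (the name topic_word is ours):

# Normalize each topic's pseudo-counts into a probability distribution over words
topic_word = lda.components_ / lda.components_.sum(axis=1, keepdims=True)
print(topic_word.sum(axis=1))  # each row now sums to 1.0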
# Infer the topic mixture of an unseen document with the fitted model
new_doc = ["Fishes are aquatic animals."]
new_dtm = vectorizer.transform(new_doc)
topic_distribution = lda.transform(new_dtm)
print("Topic distribution:", topic_distribution)
# A second LDA run with more topics and explicit training settings
lda_tuned = LatentDirichletAllocation(
    n_components=3,           # number of latent topics
    learning_method='batch',  # use the whole corpus in every EM iteration
    max_iter=20,
    random_state=0
)
lda_tuned.fit(dtm)
display_topics(lda_tuned, vectorizer.get_feature_names_out(), 3)
from sklearn.decomposition import NMF

# Non-negative Matrix Factorization is a common alternative to LDA
nmf = NMF(n_components=2, random_state=1)
nmf.fit(dtm)
# Show the three highest-weighted words per NMF component
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(nmf.components_):
    print(f"NMF Topic {idx}: ", " ".join(feature_names[i] for i in topic.argsort()[:-4:-1]))
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# One-time downloads: tokenizer models, WordNet data, and the stop word list
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess(text):
    # Lowercase and tokenize, keep alphabetic non-stop-word tokens, lemmatize
    tokens = word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    )
# Clean the corpus and rebuild the document-term matrix on the lemmatized text
preprocessed_docs = [preprocess(doc) for doc in documents]
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(preprocessed_docs)
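To see what the cleaning changes, one can refit LDA on the rebuilt matrix and reuse display_topics from above; a brief sketch (the name lda_clean is ours):

lda_clean = LatentDirichletAllocation(n_components=2, random_state=42)
lda_clean.fit(dtm)
display_topics(lda_clean, vectorizer.get_feature_names_out(), 3)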
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Sample corpus
documents = [
    "Apple and banana are fruits.",
    "I love to eat banana and mango.",
    "Fruits like apple and mango are delicious.",
    "Python and Java are popular programming languages.",
    "I write code in Python and sometimes in Java.",
    "Programming with Python is fun."
]
# Vectorize the text data
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(documents)
# Fit LDA model
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(dtm)
# Plot top words per topic
def plot_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_features_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_indices]
        top_weights = topic[top_features_indices]
        plt.figure(figsize=(8, 5))
        plt.barh(top_features[::-1], top_weights[::-1])
        plt.xlabel("Word Importance")
        plt.title(f"Top {n_top_words} Words in Topic #{topic_idx + 1}")
        plt.tight_layout()
        plt.show()
# Get feature (vocabulary) names
feature_names = vectorizer.get_feature_names_out()
plot_top_words(lda, feature_names, n_top_words=7)
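Before projecting to 2-D, a quick textual check is to print each document's dominant topic from its inferred mixture (the name doc_topics is ours):

doc_topics = lda.transform(dtm)
for i, dist in enumerate(doc_topics):
    print(f"Doc {i}: dominant topic {dist.argmax()} (weight {dist.max():.2f})")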
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
# Topic distributions from the LDA model
topic_distributions = lda.transform(dtm) # shape: (n_documents, n_topics)
# Dimensionality reduction using t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=5, max_iter=1000)
tsne_results = tsne.fit_transform(topic_distributions)
# Dominant topic per document
dominant_topic = np.argmax(topic_distributions, axis=1)
# Plot the 2D t-SNE projection
plt.figure(figsize=(8, 6))
scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=dominant_topic, cmap='tab10')
plt.colorbar(scatter, label='Dominant Topic')
plt.title("Document Clusters via LDA Topic Distributions (t-SNE)")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.tight_layout()
plt.show()
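Finally, the number of topics is a free hyperparameter. One common heuristic is to sweep n_components and compare perplexities, ideally on held-out documents; a minimal sketch on this toy corpus (the name lda_k is ours):

# Compare perplexity (lower is better) across candidate topic counts
for k in range(2, 6):
    lda_k = LatentDirichletAllocation(n_components=k, random_state=42).fit(dtm)
    print(f"n_components={k}: perplexity={lda_k.perplexity(dtm):.2f}")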