Part-of-speech (POS) tagging assigns a word class (noun, verb, adjective, etc.) to each word in a sentence. It is a foundational step in many NLP tasks because it exposes the syntactic structure of a text and helps disambiguate its meaning.
NLTK uses the Penn Treebank tagset, e.g., `NN` for noun, `VB` for verb, `JJ` for adjective, etc.
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Fetch the tokenizer and tagger models used below.  Newer NLTK releases look
# for "punkt_tab" / "averaged_perceptron_tagger_eng" instead of the legacy
# "punkt" / "averaged_perceptron_tagger" packages, so both spellings are
# requested to keep the example working across NLTK versions.
for resource in (
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(resource)

# Split one sentence into tokens and attach a Penn Treebank tag to each.
text = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(text)
tagged = pos_tag(tokens)  # list of (word, tag) pairs
print(tagged)
# Tag several sentences, one at a time.
sentences = [
    "I love Python programming.",
    "She is reading a book.",
]
for sent in sentences:
    # Each sentence is tokenized and tagged independently of the others.
    tokens = word_tokenize(sent)
    print(pos_tag(tokens))
# Filter the (word, tag) pairs in `tagged` by word class.  Penn Treebank tags
# share a prefix per class (NN/NNS/NNP/NNPS, VB/VBD/VBG/..., JJ/JJR/JJS), so
# every filter matches on the prefix to capture all variants.
nouns = [word for word, pos in tagged if pos.startswith('NN')]
print("Nouns:", nouns)

verbs = [word for word, pos in tagged if pos.startswith('VB')]
print("Verbs:", verbs)

# Prefix match (rather than == 'JJ') so comparative (JJR) and superlative
# (JJS) adjectives are included, consistent with the noun and verb filters.
adjectives = [word for word, pos in tagged if pos.startswith('JJ')]
print("Adjectives:", adjectives)
# Proper nouns are a cheap first guess at named entities.
# NOTE: this rebinds `text`, `tokens` and `tagged`; any later code that reads
# `tagged` sees the tags of this sentence.
text = "Barack Obama was born in Hawaii."
tokens = word_tokenize(text)
tagged = pos_tag(tokens)

# Prefix match so plural proper nouns (NNPS) are kept alongside singular NNP.
proper_nouns = [word for word, pos in tagged if pos.startswith('NNP')]
print("Proper Nouns (Candidates for NER):", proper_nouns)
import spacy
# Load spaCy's small English pipeline (install it beforehand with
# `python -m spacy download en_core_web_sm`) and print every token next to
# its coarse-grained part-of-speech tag.
nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
for token in doc:
    print(f"{token.text}: {token.pos_}")
from collections import Counter
import matplotlib.pyplot as plt
# Tally how often each POS tag occurs in `tagged`, print the tally, and draw
# it as a bar chart.
tag_counts = Counter(pos for _token, pos in tagged)
print(tag_counts)

tag_labels = tag_counts.keys()
tag_totals = tag_counts.values()
plt.bar(tag_labels, tag_totals)
plt.xlabel("POS Tags")
plt.ylabel("Count")
plt.title("POS Tag Frequency")
plt.xticks(rotation=45)  # slant the tick labels
plt.tight_layout()
plt.show()
import spacy
# Dependency parsing: for every token, print its text, its dependency label,
# and the text of the head token it attaches to.
nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
for token in doc:
    print(f"{token.text} --> {token.dep_} --> {token.head.text}")
import spacy
# The same POS-tagging loop with spaCy's small French pipeline (install it
# beforehand with `python -m spacy download fr_core_news_sm`).
nlp_fr = spacy.load("fr_core_news_sm")
doc = nlp_fr("Le renard brun rapide saute par-dessus le chien paresseux.")
for token in doc:
    print(f"{token.text}: {token.pos_}")