Tokenizing Text Data

Import Libraries

1
2
3
import nltk
from nltk.corpus import stopwords
from collections import Counter

Set Stopwords

1
2
3
4
5
6
# Base English stopword list from NLTK, extended with punctuation and
# typographic characters that word_tokenize emits as separate tokens.
# (stopwords.words() returns a fresh list, so mutating it is safe.)
stop_words = stopwords.words("english")
stop_words.extend([',', '.', '’', '”', '—'])

Open Text Data

1
2
# Read the review corpus line by line. The context manager guarantees
# the file handle is closed even if reading fails — the original
# open() without close() leaked the handle.
with open('movie_review.txt', 'r', encoding="utf-8") as file:
    lines = file.readlines()

Tokenize

1
2
3
4
5
6
# Flatten every line into word tokens, dropping stopwords and the
# punctuation characters added above. Membership is tested on the
# lower-cased form, but the surviving token keeps its original casing.
tokens = [
    token
    for line in lines
    for token in nltk.word_tokenize(line)
    if token.lower() not in stop_words
]

Counting Nouns

POS Tagging

1
2
3
4
5
6
# Part-of-speech tag the filtered tokens, then keep only nouns:
# Penn Treebank noun tags (NN, NNS, NNP, NNPS) all start with 'N'.
# Words are lower-cased so the counter merges "Movie" and "movie".
tags = nltk.pos_tag(tokens)

word_list = [word.lower() for word, tag in tags if tag.startswith('N')]

Counting Nouns

1
2
# Frequency count of the extracted nouns; report the ten most common.
counts = Counter(word_list)
top_ten = counts.most_common(10)
print(top_ten)

Output

1
[('movie', 406), ('batman', 303), ('film', 284), ('joker', 219), ('dark', 136), ('ledger', 131), ('knight', 124), ('time', 112), ('heath', 110), ('performance', 87)]

Counting Adjectives

POS Tagging

1
2
3
4
5
6
# Part-of-speech tag the filtered tokens, then keep only adjectives:
# Penn Treebank adjective tags (JJ, JJR, JJS) all start with 'J'.
# Lower-casing merges duplicates that differ only in capitalisation.
tags = nltk.pos_tag(tokens)

word_list = [word.lower() for word, tag in tags if tag.startswith('J')]

Counting Adjectives

1
2
# Frequency count of the extracted adjectives; show the top ten.
counts = Counter(word_list)
top_ten = counts.most_common(10)
print(top_ten)

Output

1
[('good', 141), ('best', 102), ('great', 78), ('many', 54), ('much', 52), ('comic', 43), ('real', 29), ('bad', 28), ('little', 26), ('new', 25)]

Counting Verbs

POS Tagging

1
2
3
4
5
6
# Part-of-speech tag the filtered tokens, then keep only verbs:
# Penn Treebank verb tags (VB, VBD, VBG, VBN, VBP, VBZ) start with 'V'.
# Lower-casing merges duplicates that differ only in capitalisation.
tags = nltk.pos_tag(tokens)

word_list = [word.lower() for word, tag in tags if tag.startswith('V')]

Counting Verbs

1
2
# Frequency count of the extracted verbs; show the top ten.
counts = Counter(word_list)
top_ten = counts.most_common(10)
print(top_ten)

Output

1
[('see', 59), ('get', 54), ('made', 49), ('think', 46), ('seen', 45), ('make', 45), ('say', 41), ("'ve", 37), ("'m", 32), ('going', 31)]

Visualizing Tokens

Import Libraries

1
2
import matplotlib.pyplot as plt
import re

Classifying Tokens with a Regular Expression

1
2
3
4
5
6
7
8
# Re-tokenize, additionally dropping any token that does not begin with
# an ASCII letter (numbers, stray symbols, etc.).
# The pattern is compiled once outside the loop — the original rebuilt
# it for every token — and the '^' anchor is dropped because re.match
# already anchors at the start of the string.
alpha_start = re.compile(r'[a-zA-Z]+')

tokens = []

for line in lines:
    tokenized = nltk.word_tokenize(line)
    for token in tokenized:
        if token.lower() not in stop_words and alpha_start.match(token):
            tokens.append(token)

Visualizing Tokens

1
2
3
4
# NOTE(review): `corpus` is used here and below but is never created in
# this listing; the NLTK Text object must be built from the filtered
# tokens first, otherwise this snippet raises NameError. Confirm the
# original article did not define it in an omitted cell.
corpus = nltk.Text(tokens)

plt.figure(figsize=(10, 3))
plt.title('Top 25 Words', fontsize=30)

# Text.plot draws a frequency-distribution chart of the top N words.
corpus.plot(25)

Top 25 Words Chart

token-chart


Similar Words

1
2
# Print words that occur in contexts similar to 'batman' in the corpus.
# Text.similar prints its result directly rather than returning it.
print('Similar words : ')
corpus.similar('batman')

Output

1
2
Similar words : 
superhero film action movie character better iconic seen acting actor heath performance modern difficult villain second end good come best

Collocation

1
2
print('Collocation')
# Fix: the NLTK Text method is collocations() (plural);
# corpus.collocation() raises AttributeError.
corpus.collocations()

Output

1
2
Collocation
Dark Knight; Heath Ledger; Christian Bale; comic book; Harvey Dent; Christopher Nolan; Bruce Wayne; Aaron Eckhart; Morgan Freeman; Gary Oldman; Batman Begins; Two Face; Gotham City; Maggie Gyllenhaal; Rachel Dawes; Michael Caine; special effect; Tim Burton; Jack Nicholson; dark knight