#Importing all relevant packages
import gensim
import numpy as np
import pandas as pd
import nltk
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from sklearn.datasets import fetch_20newsgroups
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer

# Load the training split of the 20 Newsgroups corpus and keep only the raw
# post texts (the returned Bunch exposes them under "data").
dataset = fetch_20newsgroups(subset='train').data

# Quick sanity checks on what was loaded
print(len(dataset))   # number of posts
print(type(dataset))  # container type the posts are stored in
print(dataset[:1])    # the first post, shown as a one-element list

11314
<class 'list'>
["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"]

#Creating a dataframe from the data imported 
# One row per post, with the raw text in a single 'text' column
full_train = pd.DataFrame({'text': dataset})
full_train.head()

# 'documents' is an alias for the same dataframe object, not a copy
documents = full_train

# Pre-processing steps 
# One lemmatizer instance reused across calls, instead of constructing a new
# WordNetLemmatizer for every single token of every document.
_LEMMATIZER = WordNetLemmatizer()


def lemmatization(text):
    """Return the WordNet lemma of a single token.

    Args:
        text: One word (token) to lemmatize.

    Returns:
        The lemma returned by ``WordNetLemmatizer.lemmatize``.

    Note:
        No part-of-speech argument is passed, so NLTK's default is used
        (noun, per the NLTK documentation). Verb forms such as "worked" are
        therefore returned unchanged.
    """
    return _LEMMATIZER.lemmatize(text)

def pre_process_data(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or have 3 characters or fewer are dropped, and every
    remaining token is passed through ``lemmatization``.

    Args:
        text: Raw document text.

    Returns:
        List of lemmatized tokens, in their order of appearance.
    """
    return [
        lemmatization(word)
        for word in simple_preprocess(text)
        if word not in STOPWORDS and len(word) > 3
    ]

#Uncomment the two lines below if the WordNet data has not been downloaded yet
#nltk.download('wordnet')  
#nltk.download('omw-1.4')

# Run the pre-processing on every post; the result is a Series holding one
# token list per document.
text_column = documents['text']
preprocessed_data = text_column.map(pre_process_data)

print(preprocessed_data)

0        [lerxst, thing, subject, nntp, posting, host, ...
1        [guykuo, carson, washington, subject, clock, p...
2        [twillis, purdue, thomas, willis, subject, que...
3        [jgreen, amber, green, subject, weitek, organi...
4        [head, harvard, jonathan, mcdowell, subject, s...
                               ...                        
11309    [zisfein, factory, zisfein, subject, migraine,...
11310    [ebodin, pearl, tuft, subject, screen, death, ...
11311    [westes, netcom, estes, subject, mounting, coo...
11312    [steve, hcrlgw, steven, collins, subject, sphe...
11313    [gunning, caltech, kevin, gunning, subject, st...
Name: text, Length: 11314, dtype: object

# Build the mapping between integer ids and tokens from the tokenized posts
dictionary = gensim.corpora.Dictionary(preprocessed_data)

# Peek at the first 11 (id, token) pairs of the mapping
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

0 addition
1 body
2 bricklin
3 brought
4 bumper
5 called
6 college
7 door
8 early
9 engine
10 enlighten

# Prune the vocabulary in place: drop tokens that occur in fewer than 10
# documents or in more than 50% of them, then keep at most the 100000 most
# frequent of what remains.
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)

# Bag-of-words representation: one list of (token_id, count) pairs per document
bow_corpus = list(map(dictionary.doc2bow, preprocessed_data))
example = bow_corpus[90]

# Show the bag-of-words content of one sample document
for token_id, occurrences in example:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     occurrences))

Word 13 ("know") appears 1 time.
Word 16 ("looking") appears 1 time.
Word 30 ("thanks") appears 1 time.
Word 32 ("university") appears 1 time.
Word 98 ("email") appears 2 time.
Word 167 ("chip") appears 1 time.
Word 178 ("information") appears 1 time.
Word 282 ("idea") appears 1 time.
Word 296 ("million") appears 1 time.
Word 374 ("available") appears 1 time.
Word 413 ("psuvm") appears 1 time.
Word 428 ("state") appears 1 time.
Word 443 ("help") appears 2 time.
Word 477 ("related") appears 1 time.
Word 488 ("work") appears 1 time.
Word 643 ("alot") appears 1 time.
Word 1020 ("important") appears 1 time.
Word 1315 ("worked") appears 1 time.
Word 1387 ("game") appears 1 time.
Word 1795 ("willing") appears 1 time.
Word 3112 ("peter") appears 1 time.
Word 3439 ("atari") appears 4 time.
Word 3440 ("out") appears 1 time.
Word 3441 ("penn") appears 1 time.
Word 3442 ("processor") appears 2 time.
Word 3443 ("schematic") appears 1 time.

# Train LDA on the bag-of-words corpus: 10 topics, 10 full passes over the
# corpus. id2word lets the model report topics as words instead of ids.
lda_settings = dict(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=10,
)
lda_model = gensim.models.LdaModel(**lda_settings)

#pip install pyLDAvis

import pyLDAvis.gensim_models
import pyLDAvis
import warnings

# Silence DeprecationWarnings for the rest of the session
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Build the interactive topic visualisation from the trained model, the
# bag-of-words corpus it was trained on and the matching dictionary
LDAvis_prepared = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
LDAvis_prepared

/usr/local/lib/python3.7/dist-packages/past/types/oldstr.py:5: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working
  from collections import Iterable
/usr/local/lib/python3.7/dist-packages/pyLDAvis/_prepare.py:247: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
  by='saliency', ascending=False).head(R).drop('saliency', 1)

from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
from collections import Counter
%matplotlib inline
# Top keywords of each of the 10 topics as (topic_id, [(word, weight), ...])
topics = lda_model.show_topics(formatted=False, num_topics=10)

# Corpus-wide frequency of every token. bow_corpus stores (token_id, count)
# pairs, so the counts are summed per token and keyed by the token *string*:
# the topic keywords below are looked up by word, and counting the raw
# (id, count) tuples would make every lookup return 0.
counter = Counter()
for bow_doc in bow_corpus:
    for token_id, token_count in bow_doc:
        counter[dictionary[token_id]] += token_count

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i, weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Shared upper limit for the word-count axis, with 10% headroom above the
# most frequent keyword (falls back to 1 so the axis is never degenerate)
word_count_limit = max(df['word_count'].max() * 1.1, 1)

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(5, 2, figsize=(16, 15), sharey=True, dpi=160)  # one subplot per topic (5 x 2 = 10)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    topic_df = df.loc[df.topic_id == i, :]
    # Wide, faint bars: actual corpus frequency of each keyword
    ax.bar(x='word', height="word_count", data=topic_df, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    # Narrow bars on a secondary axis: weight of the keyword within the topic
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_df, color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylim(0, 0.3); ax.set_ylim(0, word_count_limit)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_df['word'], rotation=30, horizontalalignment='center')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=40, y=1.05)
plt.show()

# Compute Coherence Score using C_V
# C_V coherence of the trained topics, estimated from the tokenized posts
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    model=lda_model,
    texts=preprocessed_data,
    dictionary=dictionary,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Coherence Score:  0.5626086935123448

# Compute Coherence Score using UMass
# UMass coherence of the same model; this rebinds the coherence_model_lda and
# coherence_lda names used for the C_V score
coherence_model_lda = CoherenceModel(
    coherence="u_mass",
    model=lda_model,
    texts=preprocessed_data,
    dictionary=dictionary,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Coherence Score:  -1.921655256775838

What is Topic Modeling?¶

What does LDA stand for?¶

How does LDA work?¶

Implementation¶

Step 0: Loading the data and relevant packages¶

Step 1: Reviewing and preparing the data¶

Step 2: Input preparation for topic model¶

Step 3: Parameters and training the model¶

Parameters for this implementation¶

Additional parameters that can be used!¶

Step 4: Results¶

Discussion¶

Step 5: Evaluation¶

Discussion¶

Closing Notes¶