import nltk # Import NLTK import re # Import Regular Expressions package - we'll use this to divide a text into meaningful segments

Post by **answerhappygod** » Mon May 02, 2022 12:16 pm

import nltk # Import NLTK
import re # Import Regular Expressions package - we'll use this to divide a text into meaningful segments
import matplotlib.pyplot as plt # we'll use matplotlib for graphs
import string
# Build the list of tokens to discard: NLTK's English stopwords, every
# punctuation character, and a few extra quote/dash tokens.
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
print(string.punctuation)
# Extending a list with a str adds each character as its own entry.
stopword_list.extend(string.punctuation)
# Each entry needs its own comma: adjacent string literals with no comma
# between them are concatenated into a single (unintended) entry.
# NOTE(review): the two leading '"' entries may have been curly quotes before
# this file was extracted from the web page - confirm against the original.
custom_stop_list = ['"', '"', "''", "``", '``', '--', '\'s', '', '.\"']
stopword_list.extend(custom_stop_list)
print(stopword_list)
# Read the whole text file into a single string. The name `austen_pride_raw`
# is the one the segmentation code below expects.
with open('austen_pride.txt', 'r') as austen_pride_d:
    austen_pride_raw = austen_pride_d.read()
# It's relatively simple to divide a text into regular segments -
# use some simple math and string slices.
text = austen_pride_raw
text_length = len(text)
number_of_segments = 100  # specify the number of segments you want
# The (possibly fractional) segment length is the same for every segment,
# so compute it once rather than on every pass through the loop.
segment_size = text_length / number_of_segments
# Slice i runs from int(segment_size * i) up to int(segment_size * (i + 1)).
# Truncating the fractional boundaries keeps the slices contiguous, so
# together they cover the whole text with no gaps or overlaps.
text_segments = [
    text[int(segment_size * i):int(segment_size * (i + 1))]
    for i in range(number_of_segments)
]
# Verify that this worked as intended:
print('number of segments in this text: ' + str(len(text_segments)))
print('length of segment 1: ' + str(len(text_segments[0])))
print('length of segment 2: ' + str(len(text_segments[1])))
# Split the segments into tokens using an nltk tokenize() method,
# or use a string.split() method if you prefer
# Tokenize each segment and assign to a new list
tokenized_text = [nltk.wordpunct_tokenize(segment) for segment in text_segments]
# A set gives constant-time membership tests for the per-token check below.
stopword_set = set(stopword_list)
clean_text = []  # one list of cleaned, lower-cased tokens per segment
for wordlist in tokenized_text:
    clean_wordlist = []
    for word in wordlist:
        # Lower-case BEFORE the stopword test: NLTK's English stopwords are
        # lower case, so a capitalised token such as "The" would otherwise
        # slip past the filter and be stored as "the".
        lowered = word.lower()
        if word.isalpha() and lowered not in stopword_set:  # drop punctuation, numbers and stopwords
            clean_wordlist.append(lowered)
    clean_text.append(clean_wordlist)
# Get a word frequency distribution for each segment and assign to a new list:
pride_seg_freq = [nltk.FreqDist(segment) for segment in clean_text]
# Use the most_common method to find the top 3 words of each segment, along with their word counts
# Pass any number as an argument
pride_seg_most_common = [seg_freq.most_common(3) for seg_freq in pride_seg_freq]
# Print the top 3 words for the first 10 segments.
# This confirms that the code worked as expected and shows you what the list looks like:
print(pride_seg_most_common[:10])
# Recall that the FreqDist method returns a data structure that behaves like a Python dictionary.
# Index the dictionary by key to get the count for a specific word.
# Reuse the distributions built above instead of recomputing one per segment;
# this creates a list of word counts, one per segment.
word_counts = [seg_freq['bingley'] for seg_freq in pride_seg_freq]
print(word_counts)
# Line plot of the count of 'bingley' across the segments, in text order.
plt.plot(word_counts)