import nltk # Import NLTK import re # Import Regular Expressions package - we'll use this to divide a text into meaningf

Business, Finance, Economics, Accounting, Operations Management, Computer Science, Electrical Engineering, Mechanical Engineering, Civil Engineering, Chemical Engineering, Algebra, Precalculus, Statistics and Probability, Advanced Math, Physics, Chemistry, Biology, Nursing, Psychology, Certifications, Tests, Prep, and more.
Post Reply
answerhappygod
Site Admin
Posts: 899604
Joined: Mon Aug 02, 2021 8:13 am

import nltk # Import NLTK import re # Import Regular Expressions package - we'll use this to divide a text into meaningf

Post by answerhappygod »

import nltk # Import NLTK
import re # Import Regular Expressions package - we'll use this to divide a text into meaningful segments
import matplotlib.pyplot as plt # we'll use matplotlib for graphs
import string
# Word-frequency walkthrough: split a text into equal-size segments, strip
# stopwords and punctuation, then chart how often one word occurs per segment.

# --- Stopword setup ---------------------------------------------------------
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
print(string.punctuation)
stopword_list.extend(string.punctuation)  # adds each punctuation character as its own entry
# The second and third entries were fused into one string by a missing comma
# (implicit string-literal concatenation); they are separate entries here.
custom_stop_list = ['"', '"', "''", "``", '``', '--', '\'s', '', '.\"']
stopword_list.extend(custom_stop_list)
print(stopword_list)
# Set copy for O(1) membership tests in the cleaning loop below.
stopword_set = set(stopword_list)

# --- Load the text ----------------------------------------------------------
# NOTE(review): assumes the file is UTF-8 encoded - confirm for your copy.
with open('austen_pride.txt', 'r', encoding='utf-8') as austen_pride_d:
    austen_pride_raw = austen_pride_d.read()

# --- Divide the text into regular segments ----------------------------------
# It's relatively simple to divide a text into regular segments -
# use some simple math and string slices.
text = austen_pride_raw
text_length = len(text)
number_of_segments = 100  # specify the number of segments you want
# The segment length is the same for every slice, so compute it once.
# It is kept as a float; each boundary is truncated with int(), so the
# slices tile the whole text without gaps or overlap.
segment_size = text_length / number_of_segments
text_segments = [
    text[int(segment_size * i):int(segment_size * (i + 1))]
    for i in range(number_of_segments)
]

# Verify that this worked as intended:
print('number of segments in this text: ' + str(len(text_segments)))
print('length of segment 1: ' + str(len(text_segments[0])))
print('length of segment 2: ' + str(len(text_segments[1])))

# --- Tokenize and clean -----------------------------------------------------
# Split the segments into tokens using an nltk tokenize() method,
# or use a string.split() method if you prefer.
tokenized_text = [nltk.wordpunct_tokenize(segment) for segment in text_segments]

clean_text = []  # one list of cleaned-up tokens per segment
for wordlist in tokenized_text:
    clean_wordlist = []
    for word in wordlist:
        # Lowercase BEFORE the stopword test: the stopword entries are
        # compared by exact match, so a capitalized token such as "The"
        # would not equal "the" and would slip through.
        lowered = word.lower()
        if lowered.isalpha() and lowered not in stopword_set:
            clean_wordlist.append(lowered)
    clean_text.append(clean_wordlist)

# --- Frequency distributions ------------------------------------------------
# Get a word frequency distribution for each segment and assign to a new list:
pride_seg_freq = [nltk.FreqDist(segment) for segment in clean_text]

# Use the most_common method to find the top 3 words of each segment,
# along with their word counts. Pass any number as an argument.
pride_seg_most_common = [freq.most_common(3) for freq in pride_seg_freq]

# Print the top 3 words for the first 10 segments.
# This confirms that the code worked as expected and shows you what the list looks like:
print(pride_seg_most_common[:10])

# Recall that the FreqDist method returns a data structure that behaves like
# a Python dictionary. Index it by key to get the count for a specific word.
# Reuse the distributions built above rather than recomputing them.
word_counts = [freq['bingley'] for freq in pride_seg_freq]
print(word_counts)

# --- Plot -------------------------------------------------------------------
plt.plot(word_counts)
plt.show()  # needed to display the figure when run as a plain script
Join a community of subject matter experts. Register for FREE to view solutions, replies, and use the search function. Request answer by replying!
Post Reply