Compare commits

..

3 Commits

Author SHA1 Message Date
Jeff Yates e2aaa7a2b5 turned frequency into a string for json 2020-11-22 15:24:29 -05:00
Jeff Yates fa39e4d040 fixed NaN error when dividing by 0 2020-11-22 15:18:37 -05:00
Jeff Yates ddb4003e66 added threading to file processing 2020-11-21 19:11:29 -05:00
2 changed files with 15 additions and 21 deletions

3
.gitignore vendored
View File

@ -1,3 +0,0 @@
test-data/*
master.csv
bins.csv

View File

@ -47,14 +47,9 @@ end
# output - a hash containing all of our output # output - a hash containing all of our output
#This method converts the output hash to JSON and writes it to "output.json" #This method converts the output hash to JSON and writes it to "output.json"
def write_output_json (output, filename) def write_output_json (output, filename)
begin
outfile = File.open(filename,'w') outfile = File.open(filename,'w')
outfile.write(output.to_json) outfile.write(output.to_json)
outfile.close outfile.close
rescue
pp output
abort
end
end end
def write_output_csv (output, filename) def write_output_csv (output, filename)
@ -152,13 +147,8 @@ def process_file (file_name, binfile, type)
sections = text.lines("Date and time:", chomp: true) #sections is an array of each date section from the text sections = text.lines("Date and time:", chomp: true) #sections is an array of each date section from the text
sections.delete_at(0) #we can ignore the first chunk of text sections.delete_at(0) #we can ignore the first chunk of text
sections.each do |chunk| sections.each do |chunk|
begin
timestamp = chunk.lines.delete_if {|line| line == "\r\n"}[0] #pulling out the timestamp timestamp = chunk.lines.delete_if {|line| line == "\r\n"}[0] #pulling out the timestamp
timestamp.tr!('/','-').tr!(':','').tr!(' ','_') #remove slashes and colons from timestamp, replaces spaces with unserscores timestamp.tr!('/','-').tr!(':','').tr!(' ','_') #remove slashes and colons from timestamp, replaces spaces with underscores
rescue
pp timestamp
abort
end
timestamp.strip! timestamp.strip!
output = Hash.new #Creating the output storage object output = Hash.new #Creating the output storage object
outfile = file_name + '_' + timestamp outfile = file_name + '_' + timestamp
@ -171,7 +161,10 @@ def process_file (file_name, binfile, type)
output[key] = Hash.new output[key] = Hash.new
output[key][:words] = bin_counter(bins[bin_number], text) output[key][:words] = bin_counter(bins[bin_number], text)
output[key][:total] = count_total(output[key]) output[key][:total] = count_total(output[key])
output[key][:frequency] = output[key][:total].to_f / output[:total_words].to_f output[key][:frequency] = 0
#output[key][:frequency] = output[key][:total].to_f / output[:total_words].to_f if output[[:total_words] != 0
freq = output[key][:total].to_f / output[:total_words].to_f
output[key][:frequency] = freq.to_s
end end
write_output_json(output,outfile + '-out.json') write_output_json(output,outfile + '-out.json')
write_output_csv(output,outfile + '-out.csv') write_output_csv(output,outfile + '-out.csv')
@ -186,11 +179,15 @@ end
# #
#This method will process all .txt files in the supplied directory #This method will process all .txt files in the supplied directory
def process_dir(dir_name, binfile, type) def process_dir(dir_name, binfile, type)
threads = []
Dir.glob(dir_name + '*.txt') do |file_name| Dir.glob(dir_name + '*.txt') do |file_name|
threads << Thread.new do
puts "Processing " + file_name puts "Processing " + file_name
process_file(file_name, binfile, type) process_file(file_name, binfile, type)
end end
end end
threads.each { |thr| thr.join }
end
def generate_master_output(dir_name, binfile) def generate_master_output(dir_name, binfile)
file=File.open(binfile,"r") file=File.open(binfile,"r")