pat-dissertation/sorter.rb

248 lines
8.4 KiB
Ruby
Raw Normal View History

2020-07-15 17:28:27 +00:00
#!/bin/env ruby
############################################################################
#Word sorter for Pat's dissertation #
#More documentation TBD #
#Current Usage: #
# Scans a file for groups of words and counts the totals #
2020-10-24 15:27:47 +00:00
# Accepts the following options: #
# --file - the name of the text to sort #
# --type - iat or pn #
# --bin-file - the name of the bin csv file #
# --directory - a directory of .txt files to process (instead of --file) #
2020-07-15 17:28:27 +00:00
# #
# The input file is human readable, easy to edit. #
# The output file is single-line JSON, use jq to format for reading. #
# #
#To Do: #
2020-11-21 18:08:58 +00:00
# * add frequency #
2020-07-15 17:28:27 +00:00
############################################################################
require 'json'
require 'pp'
require 'csv'
2020-07-21 22:56:46 +00:00
require 'optparse'
2020-07-15 17:28:27 +00:00
#bin_counter expects:
# bin - an array of strings to search for
# text - the document text to search through
#This method returns a hash mapping each search string (as a symbol) to the
#number of times it occurs in text. Matching is literal and case-sensitive,
#and also counts occurrences inside longer words.
def bin_counter(bin, text)
  bin.each_with_object({}) do |word, counts|
    counts[word.to_sym] = text.scan(word).size
  end
end
#count_total expects:
# bin - a hash whose :words entry is a word => count hash (as built by bin_counter)
#This method returns the sum of all the counts stored under bin[:words]
def count_total(bin)
  bin[:words].values.sum(&:to_i)
end
#write_output_json expects:
# output - a hash containing all of our output
# filename - the path of the JSON file to write
#This method pretty-prints the output hash to stdout, then writes it to
#filename as single-line JSON. Returns nil.
def write_output_json(output, filename)
  pp output
  # File.write opens, writes and closes in one call, so the handle is
  # released even if serialisation or the write itself raises.
  File.write(filename, output.to_json)
  nil
end
2020-10-24 17:56:09 +00:00
#write_output_csv expects:
# output - a hash containing all of our output
# filename - the path of the CSV file to write
#This method writes one CSV row per bin: the bin name followed by every value
#stored for that bin. The :filename and :total_words entries are not written.
def write_output_csv(output, filename)
  # Filter into a copy so the caller's hash keeps its metadata keys.
  bins = output.reject { |key, _| %i[filename total_words].include?(key) }
  header = %w[bin words total]
  # Rows carry a frequency value whenever the bins have one; keep the header in step.
  header << 'frequency' if bins.each_value.any? { |entry| entry.key?(:frequency) }
  CSV.open(filename, 'wb') do |csv|
    csv << header
    bins.each { |name, entry| csv << [name, *entry.values] }
  end
end
2020-07-15 22:13:12 +00:00
#strip_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
#This method strips out all text before the first "start" and everything from
#the next "fin" (or the next "start", whichever comes first) onwards.
#If "fin" never appears, everything after "start" is returned.
#Raises ArgumentError when there is no text after a "start" marker.
def strip_text(text, start, fin)
  # Splitting on start: element 0 is the preamble, element 1 is what follows
  # the first marker (up to the next marker, if any).
  after_start = text.lines(start, chomp: true)[1]
  if after_start.nil?
    raise ArgumentError, "no text found after start marker #{start.inspect}"
  end
  # An empty section splits into no pieces at all; report it as an empty string.
  after_start.lines(fin, chomp: true).first || ''
end
2020-10-24 15:27:47 +00:00
#split_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
# This method returns everything between start and fin and handles multiple sections.
# Text before the first start, and text between a fin and the next start, is dropped.
# Returns an empty string when start never appears.
# Example:
# For the following text:
# > asdfasdf
# > AAAAA
# > testing
# > abcdefg
# > BBBBB
# > nothing
# > something
# > AAAAA
# > moo said the cow.
# > cluck said the chicken.
# > BBBBB
# > bark said the dog
# Running split_text(text, 'AAAAA', 'BBBBB') would return:
# > testing
# > abcdefg
# > moo said the cow.
# > cluck said the chicken.
def split_text(text, start, fin)
  # drop(1) discards the preamble before the first start marker and, unlike
  # [1..-1], still yields an array when the text is empty.
  sections = text.lines(start, chomp: true).drop(1)
  # Only the piece before the first fin belongs to the section; whatever
  # follows fin is outside it and must not be kept.
  sections.map { |section| section.lines(fin, chomp: true).first.to_s }.join
end
2020-10-26 17:23:24 +00:00
#process_file expects:
# file_name - the name of the file to process
# binfile - the name of the bin file (csv) to use
# type - which type of file are we processing, must be 'pn' or 'iat'
#
#This method is the meat and potatoes. Performs the text stripping, word counting, and creates output files.
# 'iat' files produce one report for the text between PLOVEINTAKE and PLOVECLOSING.
# 'pn' files produce one report per "Date and time:" section, using the text
# between Narrative: and Signatures: in each section.
#Any other type prints a warning and writes nothing. file_name is not modified.
def process_file(file_name, binfile, type)
  text = File.read(file_name)
  # Each csv row is one bin: the first cell names it, the remaining non-empty cells are its words.
  bins = CSV.read(binfile).each_with_object({}) do |row, acc|
    next if row[0].nil? # blank row, nothing to count
    acc[row[0]] = row[1..].compact
  end
  # Only a trailing extension is removed, and on a copy, so the caller's string is left alone.
  base = file_name.delete_suffix('.txt')

  case type
  when 'iat'
    write_bin_report(base, strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING'), bins)
  when 'pn'
    # The chunk before the first "Date and time:" marker is preamble and is ignored.
    text.lines("Date and time:", chomp: true).drop(1).each do |chunk|
      outfile = "#{base}_#{section_timestamp(chunk)}"
      write_bin_report(outfile, strip_text(chunk, 'Narrative:', 'Signatures:'), bins)
    end
  else
    warn "Unknown type #{type.inspect} (expected 'iat' or 'pn'), skipping #{file_name}"
  end
end

#section_timestamp expects:
# chunk - the text following one "Date and time:" marker
#This method returns the first line that is not a bare "\r\n", with slashes
#turned into dashes, colons removed and spaces turned into underscores, so it
#can be used in a file name.
def section_timestamp(chunk)
  first_line = chunk.lines.find { |line| line != "\r\n" }
  # Non-bang tr is used on purpose: tr! returns nil when nothing was replaced,
  # which breaks a chain as soon as one character class is absent.
  first_line.to_s.tr('/', '-').tr(':', '').tr(' ', '_').strip
end

#write_bin_report expects:
# outfile - the output path without the '-out.json' / '-out.csv' suffix
# text - the text to count words in
# bins - a hash of bin name => array of words
#This method counts every bin against text and writes the JSON and CSV reports.
def write_bin_report(outfile, text, bins)
  total_words = text.split.size
  output = { filename: outfile, total_words: total_words }
  bins.each do |bin_number, words|
    entry = { words: bin_counter(words, text) }
    entry[:total] = count_total(entry)
    # 0.0 / 0.0 is NaN, which to_json refuses to serialise; report 0.0 for empty text.
    entry[:frequency] = total_words.zero? ? 0.0 : entry[:total].to_f / total_words
    output[bin_number.to_sym] = entry
  end
  write_output_json(output, outfile + '-out.json')
  write_output_csv(output, outfile + '-out.csv')
end
2020-10-26 17:23:24 +00:00
#process_dir expects:
# dir_name - a directory containing text files to process
# binfile - the name of the bin file
# type - which type of file are we processing, must be 'pn' or 'iat'
#
#This method will process all .txt files in the supplied directory.
#dir_name may be given with or without a trailing slash.
def process_dir(dir_name, binfile, type)
  # Plain concatenation only works with a trailing slash; File.join handles both.
  # A non-directory value is still treated as a path prefix, as before.
  pattern = File.directory?(dir_name) ? File.join(dir_name, '*.txt') : dir_name + '*.txt'
  Dir.glob(pattern) do |file_name|
    puts "Processing " + file_name
    process_file(file_name, binfile, type)
  end
end
2020-10-26 19:44:16 +00:00
#generate_master_output expects:
# dir_name - a directory containing the .json files to combine
# binfile - the name of the bin file
#
#This method writes 'master.csv' in the current working directory: a header
#row, then one row per .json file found in dir_name. The header holds a
#words/total/frequency column group for every line in binfile.
#dir_name may be given with or without a trailing slash.
def generate_master_output(dir_name, binfile)
  bin_count = File.foreach(binfile).count
  bin_header = (1..bin_count).flat_map do |num|
    ["Bin #{num} words", "Bin #{num} total", "Bin #{num} frequency"]
  end
  # Plain concatenation only works with a trailing slash; File.join handles both.
  # A non-directory value is still treated as a path prefix, as before.
  pattern = File.directory?(dir_name) ? File.join(dir_name, '*.json') : dir_name + '*.json'
  CSV.open('master.csv', 'wb') do |csv|
    csv << ["File", "Total Words"] + bin_header
    # Sorted so the row order does not depend on the filesystem.
    Dir.glob(pattern).sort.each do |file_name|
      puts 'Getting data from: ' + file_name
      data_hash = JSON.parse(File.read(file_name))
      # delete returns the removed value, leaving only the per-bin entries behind.
      csv_row = [data_hash.delete("filename"), data_hash.delete("total_words")]
      data_hash.each_value do |bin|
        csv_row.push(bin["words"], bin["total"], bin["frequency"])
      end
      csv << csv_row
    end
  end
end
2020-07-21 22:56:46 +00:00
# Command-line entry point: collect the options, then process either a single
# file or a whole directory (which also produces master.csv).
options = {}
parser = OptionParser.new do |opts|
  opts.banner = 'sorter.rb --options'
  opts.on("-f", "--file file", "Name of the file to process") do |file|
    options[:file] = file
  end
  opts.on("-t", "--type type", "Type of file. Must be \"iat\" or \"pn\"") do |type|
    options[:type] = type
  end
  opts.on("-b", "--bin-file binfile", "Name of the bin file") do |binfile|
    options[:binfile] = binfile
  end
  opts.on("-d", "--directory dir", "Directory containing text files to process") do |dir|
    options[:dir] = dir
  end
end
parser.parse!

if options[:file] && options[:dir]
  puts "Invalid options, you must specify either a file or a directory of files, not both."
elsif !(options[:file] || options[:dir])
  # Nothing to process: show the usage text instead of exiting silently.
  puts parser
elsif !options[:binfile]
  # Without this check the missing bin file only shows up as a TypeError from CSV.read.
  puts "Invalid options, you must supply a bin file with --bin-file."
elsif options[:file]
  process_file(options[:file], options[:binfile], options[:type])
else
  process_dir(options[:dir], options[:binfile], options[:type])
  generate_master_output(options[:dir], options[:binfile])
end