2020-07-15 17:28:27 +00:00
|
|
|
#!/bin/env ruby
|
|
|
|
############################################################################
|
|
|
|
#Word sorter for Pat's dissertation #
|
|
|
|
#More documentation TBD #
|
|
|
|
#Current Usage: #
|
|
|
|
# Scans a file for groups of words and counts the totals #
|
2020-10-24 15:27:47 +00:00
|
|
|
# Accepts the following options: #
|
|
|
|
# --file - the name of the text to sort #
|
|
|
|
# --type - iat or pn #
|
|
|
|
# --bin-file - the name of the bin csv file #
|
2020-07-15 17:28:27 +00:00
|
|
|
# #
|
|
|
|
# The input file is human readable, easy to edit. #
|
|
|
|
# The output file is single-line JSON, use jq to format for reading. #
|
|
|
|
# #
|
|
|
|
#To Do: #
|
|
|
|
# * Output to csv #
|
|
|
|
############################################################################
|
|
|
|
require 'json'
|
|
|
|
require 'pp'
|
2020-07-15 17:41:06 +00:00
|
|
|
require 'csv'
|
2020-07-21 22:56:46 +00:00
|
|
|
require 'optparse'
|
2020-07-15 17:28:27 +00:00
|
|
|
|
|
|
|
# bin_counter expects:
#   bin  - an enumerable of strings to search for (one bin's word list)
#   text - the document text to search through
# Returns a hash keyed by each search string (as a symbol) whose value is the
# number of occurrences of that string in text, stored as a string.
def bin_counter(bin, text)
  bin.each_with_object({}) do |term, tally|
    # String#scan with a string argument matches it literally.
    tally[term.to_sym] = text.scan(term).size.to_s
  end
end
|
|
|
|
|
|
|
|
# count_total expects:
#   bin - a hash whose :words entry is a hash created by bin_counter
# Returns the sum (an Integer) of every per-word count in bin[:words].
# Counts are converted with to_i, so the string counts produced by
# bin_counter are accepted.
def count_total(bin)
  bin[:words].each_value.sum(&:to_i)
end
|
|
|
|
|
|
|
|
# write_output expects:
#   output   - a hash containing all of our output
#   filename - the path of the file to write
# Converts the output hash to single-line JSON and writes it to filename,
# replacing any existing content. Returns nil.
def write_output(output, filename)
  # File.write opens, writes and closes in one call, so the handle is
  # released even if serialising or writing raises.
  File.write(filename, output.to_json)
  nil
end
|
|
|
|
|
2020-07-15 22:13:12 +00:00
|
|
|
# strip_text expects:
#   text  - the text we're working on
#   start - the starting string to search for
#   fin   - the ending string to search for
# Strips out all text before the first "start" and after the following "fin",
# returning what lies between them (markers excluded). If "start" occurs a
# second time before "fin", the result stops at that second "start". If "fin"
# never occurs, everything after "start" is returned.
# Raises ArgumentError when "start" does not occur in text.
def strip_text(text, start, fin)
  # Without this guard a missing marker surfaces as NoMethodError on nil.
  raise ArgumentError, "start marker #{start.inspect} not found in text" unless text.include?(start)

  # lines(sep, chomp: true) splits on the marker and drops it; element 1 is
  # the piece following the first marker (nil when nothing follows it).
  after_start = text.lines(start, chomp: true)[1].to_s
  # An empty section yields no pieces at all, so fall back to "".
  after_start.lines(fin, chomp: true).first.to_s
end
|
|
|
|
|
2020-10-24 15:27:47 +00:00
|
|
|
# split_text expects:
#   text  - the text we're working on
#   start - the starting string to search for
#   fin   - the ending string to search for
# Returns everything between each "start" and the "fin" that follows it,
# concatenated in order, so multiple sections are handled. Text before the
# first "start" and text after a "fin" (up to the next "start") is dropped.
# Returns "" when "start" never occurs or text is empty.
# Example:
#   For the following text:
#   > asdfasdf
#   > AAAAA
#   > testing
#   > abcdefg
#   > BBBBB
#   > nothing
#   > something
#   > AAAAA
#   > moo said the cow.
#   > cluck said the chicken.
#   > BBBBB
#   > bark said the dog
#   Running split_text(text, 'AAAAA', 'BBBBB') would return:
#   > testing
#   > abcdefg
#   > moo said the cow.
#   > cluck said the chicken.
def split_text(text, start, fin)
  # Splitting on "start" leaves the preamble in slot 0; drop(1) discards it
  # and yields [] (never nil) for empty or marker-free text.
  sections = text.lines(start, chomp: true).drop(1)
  kept = sections.map do |section|
    # Only the piece before "fin" belongs to the section; the pieces after
    # it are the text the caller wants removed.
    section.lines(fin, chomp: true).first.to_s
  end
  kept.join
end
|
|
|
|
|
2020-07-21 22:56:46 +00:00
|
|
|
# ---- Command-line handling ----
# Collect --file, --type and --bin-file into the options hash.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = 'sorter.rb --options'
  opts.on("-f", "--file file", "Name of the file to process") do |file|
    options[:file] = file
  end
  opts.on("-t", "--type type", "Type of file. Must be \"iat\" or \"pn\"") do |type|
    options[:type] = type
  end
  opts.on("-b", "--bin-file binfile", "Name of the bin file") do |binfile|
    options[:binfile] = binfile
  end
end
parser.parse!

# Both paths are needed below; stop with the usage text instead of letting
# CSV.read / File.read fail on nil.
missing = { file: '--file', binfile: '--bin-file' }.reject { |key, _flag| options[key] }.values
abort("Missing required option(s): #{missing.join(', ')}\n#{parser.help}") unless missing.empty?

# ---- Load inputs ----
csv = CSV.read(options[:binfile])
text = File.read(options[:file])

# Narrow the text to the relevant section(s) for the given document type.
# Any other type value leaves the text untouched.
case options[:type]
when 'iat' then text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING')
when 'pn'  then text = split_text(text, 'Narrative:', 'Signatures:')
end

# Output name: the input path plus "-out.json", with the first ".txt" removed
# (e.g. notes.txt -> notes-out.json).
outfile = "#{options[:file]}-out.json".sub('.txt', '')

# ---- Build the bins ----
# Each CSV row is: bin name, then that bin's words. Empty cells parse as nil
# and are dropped; blank lines parse as [] and are skipped.
bins = {}
csv.each do |row|
  next if row.empty?

  bins[row[0]] = row[1..].compact
end

# ---- Count and write ----
# For every bin, record the per-word counts and their total.
output = {}
bins.each do |bin_number, words|
  key = bin_number.to_sym
  output[key] = { words: bin_counter(words, text) }
  output[key][:total] = count_total(output[key])
end

write_output(output, outfile)
|
2020-07-15 17:28:27 +00:00
|
|
|
|