pat-dissertation/sorter.rb

81 lines
3.0 KiB
Ruby
Raw Normal View History

2020-07-15 17:28:27 +00:00
#!/bin/env ruby
############################################################################
#Word sorter for Pat's dissertation #
#More documentation TBD #
#Current Usage: #
# Scans a file for groups of words and counts the totals #
# Input file: ./tester.txt #
# Word group file: ./bins.csv #
# Output file: ./output.json #
# #
# The input file is human readable, easy to edit. #
# The output file is single-line JSON, use jq to format for reading. #
# #
#To Do: #
# * use STDIN to accept file to scan #
# * Support scanning multiple files #
# * Output to csv #
# * Strip out header and footer text #
############################################################################
require 'json'
require 'pp'
require 'csv'
2020-07-15 17:28:27 +00:00
# bin_counter expects:
#   bin  - a list of strings to search for (one word group)
#   text - the document text to search through
# Returns a hash mapping each search string (as a symbol) to the number of
# times it occurs in text. Counts are stored as strings, and matching is by
# substring via String#scan, so "cat" is also counted inside "catalog".
def bin_counter(bin, text)
  bin.each_with_object({}) do |term, tally|
    tally[term.to_sym] = text.scan(term).size.to_s
  end
end
# count_total expects:
#   bin - a hash whose :words entry maps each word to its count
#         (the hash returned by bin_counter)
# Returns the sum of all counts as an Integer; each count is converted
# with to_i before being added.
def count_total(bin)
  bin[:words].values.sum(&:to_i)
end
# write_output expects:
#   output - a hash containing all of our output
#   path   - where to write the JSON (optional, defaults to ./output.json)
# Converts the output hash to single-line JSON and writes it to path,
# replacing any existing file. Returns nil.
def write_output(output, path = './output.json')
  # Serialize first so a to_json failure cannot leave a truncated file behind.
  json = output.to_json
  # File.write opens, writes and closes in one call, so the handle is
  # released even when the write raises.
  File.write(path, json)
  nil
end
2020-07-15 22:13:12 +00:00
# strip_text expects:
#   text  - the text we're working on
#   start - the starting string to search for
#   fin   - the ending string to search for
# Returns the part of text after the first occurrence of start and before
# the next occurrence of fin. A marker that is empty or not found strips
# nothing on its side, so text without markers comes back unchanged.
def strip_text(text, start, fin)
  _head, start_hit, after_start = text.partition(start)
  # partition reports a miss with an empty separator slot; keep the whole text then.
  body = start_hit.empty? ? text : after_start
  return body if fin.empty?

  body.partition(fin).first
end
# Load the word groups first, then the document text.
csv = CSV.read('./bins.csv')
# Keep only the text between the PLOVEINTAKE and PLOVECLOSING markers.
text = strip_text(File.read('tester.txt'), 'PLOVEINTAKE', 'PLOVECLOSING')
output = {} # Results, filled in per bin below
bins = {}   # Bin name => list of search strings
2020-07-15 22:18:06 +00:00
# Turn the CSV rows into a hash: first cell is the bin name, the remaining
# non-nil cells are that bin's search strings.
csv.each do |row|
  label, *terms = row
  # A blank CSV line parses as [] and a row may start with an empty cell;
  # neither names a bin, so skip it instead of crashing.
  next if label.nil?

  bins[label] = terms.compact
end
2020-07-15 22:13:12 +00:00
2020-07-15 17:28:27 +00:00
# For every bin, count each of its search strings in the text and record
# both the per-word counts and their total under the bin's name.
bins.each do |bin_name, terms|
  key = bin_name.to_sym
  entry = { words: bin_counter(terms, text) }
  entry[:total] = count_total(entry)
  output[key] = entry
end
# Persist the collected counts as JSON.
write_output(output)