#!/bin/env ruby
############################################################################
#Word sorter for Pat's dissertation                                        #
#More documentation TBD                                                    #
#Current Usage:                                                            #
# Scans a file for groups of words and counts the totals                   #
# Accepts the following options:                                           #
#   --file      - the name of the text to sort                             #
#   --type      - iat or pn                                                #
#   --bin-file  - the name of the bin csv file                             #
#                                                                          #
# The input file is human readable, easy to edit.                          #
# The output file is single-line JSON, use jq to format for reading.       #
#                                                                          #
#To Do:                                                                    #
# * Output to csv                                                          #
############################################################################
require 'json'
require 'pp'
require 'csv'
require 'optparse'

#bin_counter expects:
# bin - a list of the strings to search for
# text - the document text to search through
#This method returns a hash that maps each search string (as a symbol) to the
#number of times it occurs in text. The counts are stored as strings.
def bin_counter(bin, text)
  bin.each_with_object({}) do |term, tally|
    # String#scan with a String argument matches it literally, so substrings count too
    tally[term.to_sym] = text.scan(term).size.to_s
  end
end

#count_total expects:
# bin - a hash whose :words entry is a hash created by bin_counter
#This method returns the sum (an Integer) of every count stored under :words
def count_total(bin)
  # Counts are stored as strings, so convert each one before adding it up
  bin[:words].values.sum(&:to_i)
end

#write_output expects:
# output - a hash containing all of our output
# filename - the path of the file to write (created or overwritten)
#This method converts the output hash to JSON and writes it to filename.
#It returns nil.
def write_output(output, filename)
  # Serialize first: if to_json raises, an existing output file is left untouched
  json = output.to_json
  # The block form closes the file handle even when the write itself raises
  File.open(filename, 'w') { |file| file.write(json) }
  nil
end

#strip_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
#This method strips out all text before the first "start" and after the "fin"
#that follows it, and returns what is left. If "fin" is missing, everything
#after "start" (up to the next "start", if any) is returned. An empty section
#yields an empty string.
#Raises ArgumentError when "start" does not occur in text.
def strip_text(text, start, fin)
  # Without this check a missing marker surfaces as NoMethodError on nil
  raise ArgumentError, "start marker #{start.inspect} not found in text" unless text.include?(start)

  # Splitting on the marker (chomp drops the marker itself); element 1 is the
  # piece right after the first marker. It is absent when the marker ends the text.
  after_start = text.lines(start, chomp: true)[1] || ''
  # An empty piece splits into no lines at all, so fall back to an empty string
  after_start.lines(fin, chomp: true).first || ''
end

#split_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
# This method returns everything between each start and the fin that follows
# it, handling multiple sections. The sections are concatenated in order; the
# markers themselves are removed, line breaks around them are kept.
# Text that has no start marker (or is empty) yields an empty string.
# Example:
# For the following text:
#  > asdfasdf
#  > AAAAA
#  > testing
#  > abcdefg
#  > BBBBB
#  > nothing
#  > something
#  > AAAAA
#  > moo said the cow.
#  > cluck said the chicken.
#  > BBBBB
#  > bark said the dog
# Running split_text(text, 'AAAAA', 'BBBBB') would return:
#  > testing
#  > abcdefg
#  > moo said the cow.
#  > cluck said the chicken.
def split_text(text, start, fin)
  # Everything before the first start marker is not part of any section.
  # drop(1) also copes with empty text, where there are no pieces at all.
  sections = text.lines(start, chomp: true).drop(1)
  kept = sections.map do |section|
    # Only the piece in front of the end marker belongs to the section; what
    # follows the marker lies outside it and must be discarded.
    section.lines(fin, chomp: true).first.to_s
  end
  kept.join
end

# Command-line options, filled in while parsing: :file, :type and :binfile
options = {}
parser = OptionParser.new
parser.banner = 'sorter.rb --options'
parser.on('-f', '--file file', 'Name of the file to process') { |value| options[:file] = value }
parser.on('-t', '--type type', 'Type of file. Must be "iat" or "pn"') { |value| options[:type] = value }
parser.on('-b', '--bin-file binfile', 'Name of the bin file') { |value| options[:binfile] = value }
# parse! removes the options it recognizes from ARGV
parser.parse!


# Main run: load the bin definitions and the text, count the words of every
# bin, and write the result as JSON next to the input file.

# Fail early with a readable message; otherwise a missing option only shows up
# as a TypeError raised from CSV.read / File.read.
abort('Both --file and --bin-file are required (see --help)') unless options[:file] && options[:binfile]

csv = CSV.read(options[:binfile])
text = File.read(options[:file])
text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if options[:type] == 'iat'
text = split_text(text, 'Narrative:', 'Signatures:') if options[:type] == 'pn'

# Only a trailing ".txt" extension is dropped, so a ".txt" elsewhere in the
# path (e.g. in a directory name) is left alone.
outfile = options[:file].sub(/\.txt\z/, '') + '-out.json'

# Turn the csv rows into a hash: the first column is the bin name, the
# remaining non-empty columns are the words of that bin.
bins = {}
csv.each do |row|
  bin_name, *words = row
  next if bin_name.nil? # blank line (or empty first column) in the bin file

  bins[bin_name] = words.compact # remove nils left by empty cells
end

output = {} # Creating the output storage object
bins.each do |bin_number, words|
  key = bin_number.to_sym
  output[key] = {}
  output[key][:words] = bin_counter(words, text)
  output[key][:total] = count_total(output[key])
end
write_output(output, outfile)