pat-dissertation/sorter.rb

#!/bin/env ruby
############################################################################
#Word sorter for Pat's dissertation                                        #
#More documentation TBD                                                    #
#Current Usage:                                                            #
# Scans a file for groups of words and counts the totals                   #
# Input file:       path given with -f/--file                              #
# Word group file:  ./bins.csv                                             #
# Output file:      <input path minus .txt>-out.json                       #
#                                                                          #
# The input file is human readable, easy to edit.                          #
# The output file is single-line JSON, use jq to format for reading.       #
#                                                                          #
#To Do:                                                                    #
# * use STDIN to accept file to scan                                       #
# * Support scanning multiple files                                        #
# * Output to csv                                                          #
# * Strip out header and footer text                                       #
############################################################################
require 'json'
require 'pp'
require 'csv'
require 'optparse'

#bin_counter expects:
# bin  - a collection of strings to search for
# text - the document text to search through
#This method returns a hash that maps each search string (as a symbol) to
#the number of times it occurs in text. Matching is plain substring
#matching via String#scan, so "cat" is also counted inside "category".
#The counts are stored as strings, not integers.
def bin_counter(bin, text)
  bin.each_with_object({}) do |term, tallies|
    tallies[term.to_sym] = text.scan(term).length.to_s
  end
end

#count_total expects:
# bin - a hash whose :words entry is a hash created by bin_counter
#       (search string => count, where the count is a string)
#This method returns an Integer: the sum of all counts in bin[:words].
#Each count is converted with to_i before being added.
def count_total(bin)
  bin[:words].values.sum(&:to_i)
end

#write_output expects:
# output   - a hash containing all of our output
# filename - path of the file to create or overwrite
#This method converts the output hash to single-line JSON and writes it to
#filename. It returns nil.
def write_output(output, filename)
  # Block form closes the file even if to_json or write raises.
  File.open(filename, 'w') { |outfile| outfile.write(output.to_json) }
  nil
end

#strip_text expects:
# text  - the text we're working on
# start - the starting string to search for
# fin   - the ending string to search for
#This method strips out all text up to and including the first occurrence
#of "start", and everything from the first following occurrence of "fin"
#onwards. If "fin" does not occur, everything after "start" is returned.
#Raises ArgumentError when "start" does not occur in text, so a document
#without the marker is reported clearly instead of failing on nil.
def strip_text(text, start, fin)
  start_at = text.index(start)
  raise ArgumentError, "start marker #{start.inspect} not found in text" unless start_at

  # Everything after the marker; an empty string when the marker ends the text.
  body = text[(start_at + start.length)..]
  fin_at = body.index(fin)
  fin_at ? body[0...fin_at] : body
end

# Command-line handling: -f/--file names the document to scan.
options = {}
OptionParser.new do |opts|
  opts.banner = 'sorter.rb --options'
  opts.on("-f", "--file file", "Name of the file to process") do |file|
    options[:file] = file
  end
end.parse!

# Stop with a readable message instead of a TypeError from File.read(nil).
abort('sorter.rb: no input file given (use -f/--file FILE)') unless options[:file]

# Each row of bins.csv is a bin name followed by the words in that bin.
csv = CSV.read('./bins.csv')
text = File.read(options[:file])
# Only the text between the two marker strings is counted.
text = strip_text(text,'PLOVEINTAKE','PLOVECLOSING')

# Output goes next to the input: "name.txt" becomes "name-out.json".
# Only a trailing ".txt" is dropped, so a ".txt" inside a directory name
# is left alone.
outfile = options[:file].sub(/\.txt\z/, '') + '-out.json'

# Turn the csv rows into a hash of bin name => words, dropping nil cells.
# Blank lines parse as empty rows and are skipped, as are rows whose first
# cell is empty, since there is no bin name to file them under.
bins = {}
csv.each do |row|
  name, *words = row
  next if name.nil?

  bins[name] = words.compact
end

# For every bin, record the per-word counts and their total.
output = {}
bins.each do |bin_number, words|
  entry = { words: bin_counter(words, text) }
  entry[:total] = count_total(entry)
  output[bin_number.to_sym] = entry
end
write_output(output,outfile)
moving existing code into repo 2020-07-15 17:28:27 +00:00			`#!/bin/env ruby`
			`############################################################################`
			`#Word sorter for Pat's dissertation #`
			`#More documentation TBD #`
			`#Current Usage: #`
			`# Scans a file for groups of words and counts the totals #`
			`# Input file: ./tester.txt #`
			`# Wroud group file: ./bins.json #`
			`# Output file: ./output.json #`
			`# #`
			`# The input file is human readable, easy to edit. #`
			`# The output file is single-line JSON, use jq to format for reading. #`
			`# #`
			`#To Do: #`
			`# * use STDIN to accept file to scan #`
			`# * Support scaning multiple files #`
			`# * Output to csv #`
			`# * Strip out header and footer text #`
			`############################################################################`
			`require 'json'`
			`require 'pp'`
updated sorter to accept csv input instead of json 2020-07-15 17:41:06 +00:00			`require 'csv'`
accept file from stdin 2020-07-21 22:56:46 +00:00			`require 'optparse'`
moving existing code into repo 2020-07-15 17:28:27 +00:00
			`#bin_counter expects:`
			`# bin - a hash with strings to search for`
			`# test - the document text to search through`
			`#This method returns a hash containg the strings and their frequency`
			`def bin_counter (bin, text)`
			`ret = Hash.new`
			`bin.each do \|word\|`
			`ret[word.to_sym] = text.scan(word).count.to_s`
			`end`
			`return ret`
			`end`

			`#count_total expects:`
			`# bin - a hash created by bin_counter`
			`#This method returns a hash with the total count of all words in a bin`
			`def count_total (bin)`
			`count = 0`
			`bin[:words].each_key do \|word\|`
			`count += bin[:words][word].to_i`
			`end`
			`return count`
			`end`

			`#write_output expects:`
			`# output - a hash containing all of our output`
			`#This method converts the output hash to JSON and writes it to output.json`
changed output file 2020-07-22 19:17:54 +00:00			`def write_output (output, filename)`
			`outfile = File.open(filename,'w')`
moving existing code into repo 2020-07-15 17:28:27 +00:00			`outfile.write(output.to_json)`
			`outfile.close`
			`end`

added strip_text method 2020-07-15 22:13:12 +00:00			`#strip_text expects:`
			`# text - the text we're working on`
			`# start - the starting string to search for`
			`# fin - the ending string to search for`
			`#This method strips out all test before "start" and after "fin"`
			`def strip_text (text, start, fin)`
			`text.lines(start,chomp: true)[1].lines(fin,chomp: true)[0]`
			`end`

accept file from stdin 2020-07-21 22:56:46 +00:00			`options = Hash.new`
			`OptionParser.new do \|opts\|`
			`opts.banner = 'sorter.rb --options'`
			`opts.on("-f", "--file file", "Name of the file to process") do \|file\|`
			`options[:file] = file`
			`end`
			`end.parse!`


added strip_text method 2020-07-15 22:13:12 +00:00			`csv = CSV.read('./bins.csv')`
accept file from stdin 2020-07-21 22:56:46 +00:00			`text = File.read(options[:file])`
added strip_text method 2020-07-15 22:13:12 +00:00			`text = strip_text(text,'PLOVEINTAKE','PLOVECLOSING')`
			`output = Hash.new #Creating the output storage object`
			`bins = Hash.new #This hash stores the bins`
changed output file 2020-07-22 19:17:54 +00:00			`outfile = options[:file] + '-out.json'`
			`outfile.slice!('.txt')`
removed unneeded to_sym 2020-07-15 22:18:06 +00:00
			`csv.each { \|bin\| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils`
added strip_text method 2020-07-15 22:13:12 +00:00
moving existing code into repo 2020-07-15 17:28:27 +00:00			`bins.each_key do \|bin_number\|`
			`key = bin_number.to_sym`
			`output[key] = Hash.new`
			`output[key][:words] = bin_counter(bins[bin_number], text)`
			`output[key][:total] = count_total(output[key])`
			`end`
changed output file 2020-07-22 19:17:54 +00:00			`write_output(output,outfile)`
moving existing code into repo 2020-07-15 17:28:27 +00:00