pat-dissertation/sorter.rb

#!/bin/env ruby
############################################################################
#Word sorter for Pat's dissertation                                        #
#More documentation TBD                                                    #
#Current Usage:                                                            #
# Scans a file for groups of words and counts the totals                   #
# Accepts the following options:                                           #
#   --file      - the name of the text to sort                             #
#   --directory - a directory of .txt files to process                     #
#   --type      - iat or pn                                                #
#   --bin-file  - the name of the bin csv file                             #
#                                                                          #
# The input file is human readable, easy to edit.                          #
# The output file is single-line JSON, use jq to format for reading.       #
#                                                                          #
#To Do:                                                                    #
# * add frequency                                                          #
############################################################################
require 'json'
require 'pp'
require 'csv'
require 'optparse'

#bin_counter expects:
# bin - a list of strings to search for
# text - the document text to search through
#This method returns a hash that maps each search string (as a symbol) to the
#number of times it occurs in text. Matching is done with String#scan on the
#literal string, so a word is also counted when it appears inside a longer word.
def bin_counter(bin, text)
  bin.each_with_object({}) do |needle, counts|
    counts[needle.to_sym] = text.scan(needle).size
  end
end

#count_total expects:
# bin - a hash whose :words entry is the hash created by bin_counter
#This method returns the sum (an Integer) of all the counts stored in bin[:words]
def count_total(bin)
  bin[:words].values.sum(&:to_i)
end

#write_output_json expects:
# output - a hash containing all of our output
# filename - the path of the JSON file to create (overwritten if it exists)
#This method pretty-prints the output hash to stdout (debugging aid), then
#converts it to JSON and writes it to filename.
def write_output_json(output, filename)
  pp output
  # Block form closes the file even if to_json or write raises.
  File.open(filename, 'w') { |file| file.write(output.to_json) }
end

#write_output_csv expects:
# output - a hash containing all of our output; the :filename and
#          :total_words entries are skipped, every other entry is treated as
#          a bin whose value is a hash of statistics
# filename - the path of the CSV file to create (overwritten if it exists)
#This method writes a header row followed by one row per bin. Each row holds
#the bin key and then the bin's values in their stored order.
#The caller's hash is left untouched.
def write_output_csv(output, filename)
  # Work on a filtered copy instead of deleting keys from the caller's hash.
  bins = output.reject { |key, _stats| %i[filename total_words].include?(key) }
  CSV.open(filename, 'wb') do |csv|
    # Rows carry words, total and frequency, so the header names all three.
    csv << %w[bin words total frequency]
    bins.each do |bin_name, stats|
      csv << [bin_name, *stats.values]
    end
  end
end

#strip_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
#This method returns the text that follows the first occurrence of "start",
#cut off at the next "start" or "fin" marker, whichever comes first.
#If "start" is not followed by any text, a NoMethodError is raised.
def strip_text(text, start, fin)
  after_start = text.lines(start, chomp: true)[1]
  after_start.lines(fin, chomp: true).first
end

#split_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
# This method returns everything between start and fin and handles multiple sections.
# The kept sections are concatenated in order. A section whose start marker has no
# matching fin is kept up to the next start marker (or the end of the text).
# Text without any start marker, including empty text, yields "".
# Example:
# For the following text:
#  > asdfasdf
#  > AAAAA
#  > testing
#  > abcdefg
#  > BBBBB
#  > nothing
#  > something
#  > AAAAA
#  > moo said the cow.
#  > cluck said the chicken.
#  > BBBBB
#  > bark said the dog
# Running split_text(text, 'AAAAA', 'BBBBB') would return:
#  > testing
#  > abcdefg
#  > moo said the cow.
#  > cluck said the chicken.
def split_text(text, start, fin)
  # Everything before the first start marker is dropped; drop(1) also copes
  # with an empty list, where [1..-1] would have returned nil.
  sections = text.lines(start, chomp: true).drop(1)
  # Within each section only the part before the first fin marker is wanted;
  # the text after fin belongs to no section and is discarded.
  sections.map { |section| section.lines(fin, chomp: true).first.to_s }.join
end

#process_file expects:
# file_name - the name of the file to process (not modified by this method)
# binfile - the name of the bin file (csv) to use
# type - which type of file are we processing, must be 'pn' or 'iat'
#
#This method is the meat and potatoes. Performs the text stripping, word counting, and creates output files.
# 'iat': counts the text between PLOVEINTAKE and PLOVECLOSING and writes
#        <file>-out.json and <file>-out.csv (".txt" suffix removed).
# 'pn':  splits the text on "Date and time:", and for every section counts the
#        text between Narrative: and Signatures:, writing one pair of files
#        per section named <file>_<timestamp>-out.json/.csv.
# Any other type produces no output.
def process_file(file_name, binfile, type)
  # Each csv row is "bin name, word, word, ..."; nil cells are dropped.
  bins = CSV.read(binfile).to_h { |row| [row[0], row[1..].compact] }
  text = File.read(file_name)
  # delete_suffix returns a new string, so the caller's file_name is untouched,
  # and only a trailing ".txt" is removed.
  basename = file_name.delete_suffix('.txt')

  case type
  when 'iat'
    body = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING')
    write_outputs(build_output(body, bins, basename), basename)
  when 'pn'
    # The chunk before the first "Date and time:" marker is ignored.
    text.lines('Date and time:', chomp: true).drop(1).each do |chunk|
      label = "#{basename}_#{section_timestamp(chunk)}"
      body = strip_text(chunk, 'Narrative:', 'Signatures:')
      write_outputs(build_output(body, bins, label), label)
    end
  end
end

#build_output expects:
# text - the text to count words in
# bins - a hash of bin name => list of search strings
# label - the value stored under :filename
#This helper returns the output hash: :filename, :total_words, then one entry
#per bin holding :words, :total and :frequency (in that order).
def build_output(text, bins, label)
  total_words = text.split.size
  output = { filename: label, total_words: total_words }
  bins.each do |bin_number, words|
    stats = { words: bin_counter(words, text) }
    stats[:total] = count_total(stats)
    # 0/0 would be NaN, which cannot be serialised to JSON; report 0.0 instead.
    stats[:frequency] = total_words.zero? ? 0.0 : stats[:total].to_f / total_words
    output[bin_number.to_sym] = stats
  end
  output
end

#section_timestamp expects:
# chunk - one "Date and time:" section of a pn file
#This helper takes the first line that is not a bare "\r\n" and makes it
#filename friendly: slashes become dashes, colons are removed, spaces become
#underscores, and surrounding whitespace (the line ending) is stripped.
def section_timestamp(chunk)
  first_line = chunk.lines.reject { |line| line == "\r\n" }.first
  # Non-bang methods: tr! returns nil when nothing changes, which broke the chain.
  first_line.tr('/', '-').delete(':').tr(' ', '_').strip
end

#write_outputs expects:
# output - the output hash
# basename - the output path without the "-out.json"/"-out.csv" ending
#This helper writes the JSON file first and then the CSV file.
def write_outputs(output, basename)
  write_output_json(output, "#{basename}-out.json")
  write_output_csv(output, "#{basename}-out.csv")
end

#process_dir expects:
# dir_name - a directory containing text files to process (with or without a trailing slash)
# binfile - the name of the bin file
# type - which type of file are we processing, must be 'pn' or 'iat'
#
#This method will process all .txt files in the supplied directory
def process_dir(dir_name, binfile, type)
  # File.join inserts the separator when dir_name has no trailing slash;
  # plain concatenation turned "data" into the pattern "data*.txt".
  Dir.glob(File.join(dir_name, '*.txt')).sort.each do |file_name|
    puts "Processing " + file_name
    process_file(file_name, binfile, type)
  end
end

#generate_master_output expects:
# dir_name - a directory containing the .json output files (with or without a trailing slash)
# binfile - the name of the bin file; its line count decides how many bin columns the header gets
#
#This method writes "master.csv" in the current working directory: a header
#row, then one row per .json file holding the file's "filename" and
#"total_words" followed by "words", "total" and "frequency" of every other
#entry. Header columns are numbered by line position in the bin file
#("Bin 1", "Bin 2", ...), not by the bin names stored in the json.
def generate_master_output(dir_name, binfile)
  bin_count = File.foreach(binfile).count
  bin_header = (1..bin_count).flat_map do |num|
    ["Bin #{num} words", "Bin #{num} total", "Bin #{num} frequency"]
  end

  CSV.open('master.csv', 'wb') do |csv|
    csv << ["File", "Total Words"] + bin_header
    # File.join inserts the separator when dir_name has no trailing slash;
    # sort keeps the row order stable across platforms.
    Dir.glob(File.join(dir_name, '*.json')).sort.each do |file_name|
      puts 'Getting data from: ' + file_name
      data_hash = JSON.parse(File.read(file_name))
      # delete returns the removed value, leaving only the bin entries behind.
      csv_row = [data_hash.delete("filename"), data_hash.delete("total_words")]
      data_hash.each_value do |stats|
        csv_row.push(stats["words"], stats["total"], stats["frequency"])
      end
      csv << csv_row
    end
  end
end

# Command-line entry point: collect the options, then process either a single
# file (--file) or every .txt file of a directory (--directory). Directory
# mode additionally builds master.csv from the generated json files.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = 'sorter.rb --options'
  opts.on("-f", "--file file", "Name of the file to process") do |file|
    options[:file] = file
  end
  opts.on("-t", "--type type", "Type of file. Must be \"iat\" or \"pn\"") do |type|
    options[:type] = type
  end
  opts.on("-b", "--bin-file binfile", "Name of the bin file") do |binfile|
    options[:binfile] = binfile
  end
  opts.on("-d", "--directory dir", "Directory containing text files to process") do |dir|
    options[:dir] = dir
  end
end
parser.parse!

if options[:file] && options[:dir]
  puts "Invalid options, you must specify either a file or a directory of files, not both."
elsif options[:file]
  process_file(options[:file], options[:binfile], options[:type])
elsif options[:dir]
  process_dir(options[:dir], options[:binfile], options[:type])
  generate_master_output(options[:dir], options[:binfile])
else
  # Neither --file nor --directory was given: show the usage text instead of
  # exiting silently.
  puts parser
end
moving existing code into repo 2020-07-15 17:28:27 +00:00			`#!/bin/env ruby`
			`############################################################################`
			`#Word sorter for Pat's dissertation #`
			`#More documentation TBD #`
			`#Current Usage: #`
			`# Scans a file for groups of words and counts the totals #`
cleaned up comments 2020-10-24 15:27:47 +00:00			`# Accepts the following options: #`
			`# --file - the name of the text to sort #`
			`# --type - iat or pn #`
			`# --bin-file - the name of the bin csv file #`
moving existing code into repo 2020-07-15 17:28:27 +00:00			`# #`
			`# The input file is human readable, easy to edit. #`
			`# The output file is single-line JSON, use jq to format for reading. #`
			`# #`
			`#To Do: #`
changed total word count behaviour 2020-11-21 18:08:58 +00:00			`# * add frequency #`
moving existing code into repo 2020-07-15 17:28:27 +00:00			`############################################################################`
			`require 'json'`
			`require 'pp'`
updated sorter to accept csv input instead of json 2020-07-15 17:41:06 +00:00			`require 'csv'`
accept file from stdin 2020-07-21 22:56:46 +00:00			`require 'optparse'`
moving existing code into repo 2020-07-15 17:28:27 +00:00
			`#bin_counter expects:`
			`# bin - a hash with strings to search for`
			`# test - the document text to search through`
added split_text() and --type option 2020-10-24 15:04:04 +00:00			`#This method returns a hash containing the strings and their frequency`
moving existing code into repo 2020-07-15 17:28:27 +00:00			`def bin_counter (bin, text)`
			`ret = Hash.new`
			`bin.each do \|word\|`
changed word count hash/json from string to integer 2020-10-24 19:10:42 +00:00			`ret[word.to_sym] = text.scan(word).count.to_i`
moving existing code into repo 2020-07-15 17:28:27 +00:00			`end`
			`return ret`
			`end`

			`#count_total expects:`
			`# bin - a hash created by bin_counter`
			`#This method returns a hash with the total count of all words in a bin`
			`def count_total (bin)`
			`count = 0`
			`bin[:words].each_key do \|word\|`
			`count += bin[:words][word].to_i`
			`end`
			`return count`
			`end`

			`#write_output expects:`
			`# output - a hash containing all of our output`
added csv output 2020-10-24 17:56:09 +00:00			`#This method converts the output hash to JSON and writes it to "output.json"`
			`def write_output_json (output, filename)`
print output hash for debugging 2020-11-22 20:29:25 +00:00			`pp output`
changed output file 2020-07-22 19:17:54 +00:00			`outfile = File.open(filename,'w')`
moving existing code into repo 2020-07-15 17:28:27 +00:00			`outfile.write(output.to_json)`
			`outfile.close`
			`end`

added csv output 2020-10-24 17:56:09 +00:00			`def write_output_csv (output, filename)`
			`CSV.open(filename, 'wb') do \|csv\|`
			`csv << ["bin", "words", "total"]`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`output.delete(:filename)`
changed total word count behaviour 2020-11-21 18:08:58 +00:00			`output.delete(:total_words)`
added csv output 2020-10-24 17:56:09 +00:00			`output.each_key do \|key\|`
			`line = []`
			`line.push(key)`
			`output[key].each_key do \|sub_key\|`
			`line.push(output[key][sub_key])`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`end`
added csv output 2020-10-24 17:56:09 +00:00			`csv << line`
			`end`
			`end`
			`end`

added strip_text method 2020-07-15 22:13:12 +00:00			`#strip_text expects:`
			`# text - the text we're working on`
			`# start - the starting string to search for`
			`# fin - the ending string to search for`
			`#This method strips out all test before "start" and after "fin"`
			`def strip_text (text, start, fin)`
			`text.lines(start,chomp: true)[1].lines(fin,chomp: true)[0]`
			`end`

cleaned up comments 2020-10-24 15:27:47 +00:00			`#split_text expects:`
			`# text - the text we're working on`
			`# start - the starting string to search for`
			`# fin the ending string to search for`
			`# This method returns everything between start and fin and handles multiple sections`
			`# Example:`
			`# For the following text:`
			`# > asdfasdf`
			`# > AAAAA`
			`# > testing`
			`# > abcdefg`
			`# > BBBBB`
			`# > nothing`
			`# > something`
			`# > AAAAA`
			`# > moo said the cow.`
			`# > cluck said the chicken.`
			`# > BBBBB`
			`# > bark said the dog`
			`# Running split_text(text, 'AAAAA', 'BBBBB') would return:`
			`# > testing`
			`# > abcdefg`
			`# > moo said the cow.`
			`# > cluck said the chicken.`
added split_text() and --type option 2020-10-24 15:04:04 +00:00			`def split_text (text, start, fin)`
			`split1 = text.lines(start, chomp: true)[1..-1]`
			`split2 = []`
			`ret = ""`
			`split1.each do \|section\|`
			`split2 += section.lines(fin, chomp: true)`
			`end`
			`split2.each do \|section\|`
			`ret += section`
			`end`
			`return ret`
			`end`

added directory/batch processing 2020-10-26 17:23:24 +00:00			`#process_file expects:`
			`# file_name - the name of the file to process`
			`# binfile - the name of the bin file (csv) to use`
			`# type - which type of file are we processing, must be 'pn' or 'iat'`
			`#`
			`#This method is the meat and potatos. Preforms the text stripping, word counting, and creates output files.`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`def process_file (file_name, binfile, type)`
made pn option operate on each date section 2020-11-21 21:45:55 +00:00			`#text = split_text(text, 'Narrative:', 'Signatures:') if type == 'pn'`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`csv = CSV.read(binfile)`
			`text = File.read(file_name)`
			`bins = Hash.new #This hash stores the bins`
made pn option operate on each date section 2020-11-21 21:45:55 +00:00			`csv.each { \|bin\| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`outfile = file_name`
			`outfile.slice!('.txt')`
made pn option operate on each date section 2020-11-21 21:45:55 +00:00			`if type == 'iat'`
			`text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if type == 'iat'`
			`output = Hash.new #Creating the output storage object`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`#puts outfile`
made pn option operate on each date section 2020-11-21 21:45:55 +00:00			`output[:filename] = outfile`
			`output[:total_words] = text.split.size`
			`bins.each_key do \|bin_number\|`
			`key = bin_number.to_sym`
			`output[key] = Hash.new`
			`output[key][:words] = bin_counter(bins[bin_number], text)`
			`output[key][:total] = count_total(output[key])`
added frequency to output 2020-11-21 22:32:04 +00:00			`output[key][:frequency] = output[key][:total].to_f / output[:total_words].to_f`
made pn option operate on each date section 2020-11-21 21:45:55 +00:00			`end`
			`write_output_json(output,outfile + '-out.json')`
			`write_output_csv(output,outfile + '-out.csv')`
			`elsif type == 'pn'`
			`sections = text.lines("Date and time:", chomp: true) #sections is an arrary of each date section from the text`
			`sections.delete_at(0) #we can ignore the first chunk of text`
			`sections.each do \|chunk\|`
			`timestamp = chunk.lines.delete_if {\|line\| line == "\r\n"}[0] #pulling out the timestamp`
minor changes to output and date format 2020-11-21 21:53:48 +00:00			`timestamp.tr!('/','-').tr!(':','').tr!(' ','_') #remove slashes and colons from timestamp, replaces spaces with unserscores`
made pn option operate on each date section 2020-11-21 21:45:55 +00:00			`timestamp.strip!`
			`output = Hash.new #Creating the output storage object`
			`outfile = file_name + '_' + timestamp`
			`outfile.slice!('.txt')`
			`text = strip_text(chunk, 'Narrative:', 'Signatures:')`
			`output[:filename] = outfile`
			`output[:total_words] = text.split.size`
			`bins.each_key do \|bin_number\|`
			`key = bin_number.to_sym`
			`output[key] = Hash.new`
			`output[key][:words] = bin_counter(bins[bin_number], text)`
			`output[key][:total] = count_total(output[key])`
added frequency to output 2020-11-21 22:32:04 +00:00			`output[key][:frequency] = output[key][:total].to_f / output[:total_words].to_f`
made pn option operate on each date section 2020-11-21 21:45:55 +00:00			`end`
			`write_output_json(output,outfile + '-out.json')`
			`write_output_csv(output,outfile + '-out.csv')`
			`end`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`end`
added directory/batch processing 2020-10-26 17:23:24 +00:00			`end`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00
added directory/batch processing 2020-10-26 17:23:24 +00:00			`#process_dir expects:`
			`# dir_name - a direcroty containing text files to process`
			`# binfile - the name of the bin file`
			`# type - which type of file are we processing, must be 'pn' or 'iat'`
			`#`
			`#This method will process all .txt files in the supplied directory`
			`def process_dir(dir_name, binfile, type)`
			`Dir.glob(dir_name + '*.txt') do \|file_name\|`
minor changes to output and date format 2020-11-21 21:53:48 +00:00			`puts "Processing " + file_name`
added directory/batch processing 2020-10-26 17:23:24 +00:00			`process_file(file_name, binfile, type)`
			`end`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`end`

added code to output master.csv 2020-10-26 19:44:16 +00:00			`def generate_master_output(dir_name, binfile)`
			`file=File.open(binfile,"r")`
			`bin_count = file.readlines.size`
			`file.close`
			`bin_header = []`
			`bin_count.times do \|num\|`
			`num += 1`
			`words_head = "Bin " + num.to_s + " words"`
			`total_head = "Bin " + num.to_s + " total"`
added frequency to output 2020-11-21 22:32:04 +00:00			`freq_head = "Bin " + num.to_s + " frequency"`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`bin_header.push(words_head)`
			`bin_header.push(total_head)`
added frequency to output 2020-11-21 22:32:04 +00:00			`bin_header.push(freq_head)`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`end`
			`CSV.open('master.csv', 'wb') do \|csv\|`
			`header = ["File", "Total Words" ] + bin_header`
			`csv << header`
			`Dir.glob(dir_name + '*.json') do \|file_name\|`
minor changes to output and date format 2020-11-21 21:53:48 +00:00			`puts 'Getting data from: ' + file_name`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`csv_row = []`
			`json_file = File.read(file_name)`
			`data_hash = JSON.parse(json_file)`
			`csv_row.push(data_hash["filename"])`
changed total word count behaviour 2020-11-21 18:08:58 +00:00			`csv_row.push(data_hash["total_words"])`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`data_hash.delete("filename")`
changed total word count behaviour 2020-11-21 18:08:58 +00:00			`data_hash.delete("total_words")`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`data_hash.each_key do \|key\|`
			`csv_row.push(data_hash[key]["words"])`
			`csv_row.push(data_hash[key]["total"])`
added frequency to output 2020-11-21 22:32:04 +00:00			`csv_row.push(data_hash[key]["frequency"])`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`end`
			`csv << csv_row`
			`end`
			`end`
			`end`

accept file from stdin 2020-07-21 22:56:46 +00:00			`options = Hash.new`
			`OptionParser.new do \|opts\|`
			`opts.banner = 'sorter.rb --options'`
			`opts.on("-f", "--file file", "Name of the file to process") do \|file\|`
			`options[:file] = file`
			`end`
added split_text() and --type option 2020-10-24 15:04:04 +00:00			`opts.on("-t", "--type type", "Type of file. Must be \"iat\" or \"pn\"") do \|type\|`
			`options[:type] = type`
			`end`
added --bins option 2020-10-24 15:13:43 +00:00			`opts.on("-b", "--bin-file binfile", "Name of the bin file") do \|binfile\|`
			`options[:binfile] = binfile`
			`end`
added directory/batch processing 2020-10-26 17:23:24 +00:00			`opts.on("-d", "--directory dir", "Directory containing text files to process") do \|dir\|`
moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`options[:dir] = dir`
			`end`
accept file from stdin 2020-07-21 22:56:46 +00:00			`end.parse!`

moved file processing into a method to prepare for batch operation code 2020-10-26 16:56:52 +00:00			`if options[:file] && options[:dir]`
			`puts "Invalid options, you must either a file or a directoy of files."`
			`elsif options[:file]`
			`process_file(options[:file], options[:binfile], options[:type])`
added directory/batch processing 2020-10-26 17:23:24 +00:00			`elsif options[:dir]`
			`process_dir(options[:dir], options[:binfile], options[:type])`
added code to output master.csv 2020-10-26 19:44:16 +00:00			`generate_master_output(options[:dir], options[:binfile])`
moving existing code into repo 2020-07-15 17:28:27 +00:00			`end`