#!/usr/bin/env ruby
############################################################################
#Word sorter for Pat's dissertation
#
#More documentation TBD
#
#Current Usage:
#  Scans a file for groups of words and counts the totals
#  Accepts the following options:
#    --file      - the name of the text to sort
#    --directory - a directory of .txt files to sort
#    --type      - iat or pn
#    --bin-file  - the name of the bin csv file
#
#  The input file is human readable, easy to edit.
#  The output file is single-line JSON, use jq to format for reading.
#
#To Do:
#  * add frequency
############################################################################
require 'json'
require 'pp'
require 'csv'
require 'optparse'

#bin_counter expects:
# bin - a list of strings to search for
# text - the document text to search through
#This method returns a hash mapping each search string (as a symbol) to the
#number of times it occurs in text. Matching is a plain substring scan, so a
#search string is also counted when it appears inside a longer word.
#Empty search strings are skipped: scanning for "" would match at every
#position of the text and inflate the totals.
def bin_counter(bin, text)
  bin.each_with_object({}) do |word, counts|
    next if word.empty?

    counts[word.to_sym] = text.scan(word).count
  end
end

#count_total expects:
# bin - a hash whose :words entry was created by bin_counter
#This method returns the sum of the counts of all words in the bin
#(0 when the bin has no words).
def count_total(bin)
  bin[:words].each_value.sum(&:to_i)
end

#write_output_json expects:
# output - a hash containing all of our output
# filename - the path of the file to write
#This method converts the output hash to single-line JSON and writes it to
#filename. File.write closes the file even when writing fails.
def write_output_json(output, filename)
  File.write(filename, output.to_json)
end

#write_output_csv expects:
# output - a hash containing all of our output
# filename - the path of the file to write
#This method writes one csv row per bin: the bin name followed by every value
#stored for that bin. The :filename and :total_words entries are left out of
#the csv. The output hash itself is not modified.
def write_output_csv(output, filename)
  bins = output.reject { |key, _values| %i[filename total_words].include?(key) }
  # Take the column names from the data so the header always matches the rows.
  columns = bins.empty? ? %w[words total] : bins.values.first.keys

  CSV.open(filename, 'wb') do |csv|
    csv << ['bin', *columns]
    bins.each { |name, values| csv << [name, *values.values] }
  end
end

#strip_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
#This method strips out all text before "start" and
#after "fin".
#If "start" occurs a second time before "fin", the result stops at that second
#"start" (every "start" opens a new section).
#Returns an empty string when nothing follows "start", and everything after
#"start" when "fin" never occurs.
#Raises ArgumentError when "start" does not occur in text.
def strip_text(text, start, fin)
  _before, marker, remainder = text.partition(start)
  raise ArgumentError, "start marker #{start.inspect} not found in text" if marker.empty?

  # Cut at the next "start" first, then at the first "fin" inside that section.
  remainder.partition(start).first.partition(fin).first
end

#split_text expects:
# text - the text we're working on
# start - the starting string to search for
# fin - the ending string to search for
# This method returns everything between start and fin and handles multiple sections.
# Text before the first start and text after each fin is dropped. A section
# without a closing fin is kept up to the next start (or the end of the text).
# Returns an empty string when start never occurs.
# Example:
# For the following text:
# > asdfasdf
# > AAAAA
# > testing
# > abcdefg
# > BBBBB
# > nothing
# > something
# > AAAAA
# > moo said the cow.
# > cluck said the chicken.
# > BBBBB
# > bark said the dog
# Running split_text(text, 'AAAAA', 'BBBBB') would return:
# > testing
# > abcdefg
# > moo said the cow.
# > cluck said the chicken.
# (the line breaks that surround the markers are part of the result)
def split_text(text, start, fin)
  sections = text.lines(start, chomp: true).drop(1)
  # Only the part before the first fin belongs to a section.
  sections.map { |section| section.partition(fin).first }.join
end

#process_file expects:
# file_name - the name of the file to process
# binfile - the name of the bin file (csv) to use
# type - which type of file are we processing, must be 'pn' or 'iat'
#
#This method is the meat and potatoes. Performs the text stripping, word counting, and creates output files.
#process_file reads file_name, counts the words of every bin listed in binfile
#and writes a "<name>-out.json" and a "<name>-out.csv" report:
# file_name - the text file to process; a trailing ".txt" is dropped from the
#             report names, the string itself is not modified
# binfile - csv file with one bin per row: bin name followed by its search strings
# type - 'iat': one report for the text between PLOVEINTAKE and PLOVECLOSING
#        'pn': one report per "Date and time:" section, covering the text
#              between "Narrative:" and "Signatures:"; the section's timestamp
#              line is appended to the report name
#Raises ArgumentError when type is neither 'iat' nor 'pn'.
def process_file(file_name, binfile, type)
  raise ArgumentError, "type must be 'iat' or 'pn', got #{type.inspect}" unless %w[iat pn].include?(type)

  bins = load_bins(binfile)
  text = File.read(file_name)
  base_name = file_name.delete_suffix('.txt')

  if type == 'iat'
    write_report(base_name, strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING'), bins)
  else
    # The text before the first "Date and time:" marker is not a section.
    sections = text.lines('Date and time:', chomp: true).drop(1)
    sections.each do |chunk|
      outfile = "#{base_name}_#{timestamp_slug(chunk)}"
      write_report(outfile, strip_text(chunk, 'Narrative:', 'Signatures:'), bins)
    end
  end
end

#load_bins reads the bin csv into a hash of bin name => list of search strings.
#Empty cells and blank rows are skipped.
def load_bins(binfile)
  CSV.read(binfile).each_with_object({}) do |row, bins|
    name, *words = row
    next if name.nil?

    bins[name] = words.compact
  end
end

#timestamp_slug returns the first non-blank line of chunk in a form that is
#usable inside a file name: slashes become dashes, colons are removed and
#spaces become underscores; surrounding whitespace is stripped last.
def timestamp_slug(chunk)
  first_line = chunk.lines.find { |line| !line.strip.empty? }.to_s
  # Non-bang tr: the bang variants return nil when nothing was replaced,
  # which breaks a chain as soon as one of the characters is absent.
  first_line.tr('/', '-').tr(':', '').tr(' ', '_').strip
end

#write_report counts every bin in text and writes "<outfile>-out.json" and
#"<outfile>-out.csv". The frequency of a bin is its total divided by the number
#of words in text, or 0.0 for a text without words (a NaN could not be
#serialised to JSON).
def write_report(outfile, text, bins)
  total_words = text.split.size
  output = { filename: outfile, total_words: total_words }

  bins.each do |bin_number, words|
    entry = { words: bin_counter(words, text) }
    entry[:total] = count_total(entry)
    entry[:frequency] = total_words.zero? ? 0.0 : entry[:total].to_f / total_words
    output[bin_number.to_sym] = entry
  end

  write_output_json(output, "#{outfile}-out.json")
  write_output_csv(output, "#{outfile}-out.csv")
end

#dir_pattern builds the glob pattern for wildcard inside dir_name. An existing
#directory is joined with a separator whether or not it ends in "/"; anything
#else is used as a plain prefix, as before.
def dir_pattern(dir_name, wildcard)
  File.directory?(dir_name) ? File.join(dir_name, wildcard) : dir_name + wildcard
end

#process_dir expects:
# dir_name - a directory containing text files to process
# binfile - the name of the bin file
# type - which type of file are we processing, must be 'pn' or 'iat'
#
#This method will process all .txt files in the supplied directory, one thread
#per file, and waits for all of them to finish. An error raised while
#processing a file is re-raised here when its thread is joined.
def process_dir(dir_name, binfile, type)
  threads = Dir.glob(dir_pattern(dir_name, '*.txt')).map do |file_name|
    Thread.new do
      puts "Processing " + file_name
      process_file(file_name, binfile, type)
    end
  end
  threads.each(&:join)
end

#generate_master_output expects:
# dir_name - a directory containing the .json reports written by process_file
# binfile - the name of the bin file
#
#This method writes "master.csv" to the current directory: a header row, then
#one row per .json file (in file name order) with the file's name, total word
#count and the words, total and frequency of each of its bins.
#The header gets one "Bin N ..." column group per bin in binfile.
def generate_master_output(dir_name, binfile)
  bin_count = load_bins(binfile).size
  bin_header = (1..bin_count).flat_map do |num|
    ["Bin #{num} words", "Bin #{num} total", "Bin #{num} frequency"]
  end

  CSV.open('master.csv', 'wb') do |csv|
    csv << ['File', 'Total Words'] + bin_header
    Dir.glob(dir_pattern(dir_name, '*.json')).sort.each do |file_name|
      puts 'Getting data from: ' + file_name
      data_hash = JSON.parse(File.read(file_name))
      csv_row = [data_hash.delete('filename'), data_hash.delete('total_words')]
      # Everything left in the hash is a bin.
      data_hash.each_value do |bin|
        csv_row.push(bin['words'], bin['total'], bin['frequency'])
      end
      csv << csv_row
    end
  end
end

# Command line entry point; skipped when the file is loaded from other code.
if __FILE__ == $PROGRAM_NAME
  options = {}
  parser = OptionParser.new do |opts|
    opts.banner = 'sorter.rb --options'
    opts.on("-f", "--file file", "Name of the file to process") do |file|
      options[:file] = file
    end
    opts.on("-t", "--type type", "Type of file. Must be \"iat\" or \"pn\"") do |type|
      options[:type] = type
    end
    opts.on("-b", "--bin-file binfile", "Name of the bin file") do |binfile|
      options[:binfile] = binfile
    end
    opts.on("-d", "--directory dir", "Directory containing text files to process") do |dir|
      options[:dir] = dir
    end
  end
  parser.parse!

  if options[:file] && options[:dir]
    puts 'Invalid options, you must specify either a file or a directory of files, not both.'
  elsif options[:file] || options[:dir]
    # Fail early with a readable message instead of a TypeError deep inside CSV.read.
    abort "Missing --bin-file.\n#{parser}" unless options[:binfile]
    abort "--type must be \"iat\" or \"pn\".\n#{parser}" unless %w[iat pn].include?(options[:type])

    if options[:file]
      process_file(options[:file], options[:binfile], options[:type])
    else
      process_dir(options[:dir], options[:binfile], options[:type])
      generate_master_output(options[:dir], options[:binfile])
    end
  else
    puts parser
  end
end