You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
256 lines
8.5 KiB
256 lines
8.5 KiB
#!/bin/env ruby |
|
############################################################################
# Word sorter for Pat's dissertation                                       #
# More documentation TBD                                                   #
# Current Usage:                                                           #
#   Scans a file for groups of words and counts the totals                 #
#   Accepts the following options:                                         #
#     --file      - the name of the text to sort                           #
#     --directory - a directory of .txt files to sort                      #
#     --type      - iat or pn                                              #
#     --bin-file  - the name of the bin csv file                           #
#                                                                          #
#   The input file is human readable, easy to edit.                        #
#   The output file is single-line JSON, use jq to format for reading.     #
#                                                                          #
# To Do:                                                                   #
#   * add frequency                                                        #
############################################################################
|
require 'json' |
|
require 'pp' |
|
require 'csv' |
|
require 'optparse' |
|
|
|
# bin_counter expects:
#   bin  - an array of strings (the words/phrases of one bin) to search for
#   text - the document text to search through
# This method returns a hash mapping each search string (as a symbol) to the
# number of times it occurs in text. Matching is a literal, case-sensitive
# substring search, so "cat" is also counted inside "category".
def bin_counter(bin, text)
  bin.each_with_object({}) do |word, counts|
    # Skip blank entries: scanning for "" matches at every character
    # position and would inflate the bin total; nil cannot be scanned at all.
    next if word.nil? || word.empty?

    counts[word.to_sym] = text.scan(word).count
  end
end
|
|
|
# count_total expects:
#   bin - a hash whose :words entry is the hash created by bin_counter
#         (search string => count)
# This method returns an Integer: the sum of the counts of all words in the
# bin. A bin with no :words entry counts as 0.
def count_total(bin)
  # to_i keeps the original tolerance for nil or string counts.
  (bin[:words] || {}).values.sum(&:to_i)
end
|
|
|
# write_output_json expects:
#   output   - a hash containing all of our output
#   filename - the path of the JSON file to write
# This method converts the output hash to single-line JSON and writes it to
# filename. If anything goes wrong (serialising or writing) the output hash is
# pretty-printed so the results are not lost, and the program aborts with the
# reason.
def write_output_json(output, filename)
  # File.write opens, writes and closes in one call, so the handle cannot leak.
  File.write(filename, output.to_json)
rescue StandardError => e
  pp output
  abort "Could not write #{filename}: #{e.message}"
end
|
|
|
# write_output_csv expects:
#   output   - the output hash: :filename and :total_words plus one entry per
#              bin, each bin being a hash of values (words, total, frequency)
#   filename - the path of the CSV file to write
# This method writes one CSV row per bin: the bin name followed by every value
# stored for that bin, in the order they were inserted. The :filename and
# :total_words entries are left out of the CSV. The caller's hash is not
# modified.
def write_output_csv(output, filename)
  # Filter into a new hash instead of deleting keys from the caller's hash.
  bins = output.reject { |key, _| %i[filename total_words].include?(key) }

  CSV.open(filename, 'wb') do |csv|
    # Four columns, matching the words/total/frequency values each row carries.
    csv << %w[bin words total frequency]
    bins.each do |bin_name, values|
      csv << [bin_name, *values.values]
    end
  end
end
|
|
|
# strip_text expects:
#   text  - the text we're working on
#   start - the starting string to search for
#   fin   - the ending string to search for
# This method strips out all text before the first "start" and after the
# following "fin" (or after the next "start", whichever comes first) and
# returns what is left. Neither marker is included in the result.
# Returns "" when nothing follows the start marker.
# Raises ArgumentError when the start marker does not occur in text.
def strip_text(text, start, fin)
  unless text.include?(start)
    raise ArgumentError, "start marker #{start.inspect} not found in text"
  end

  # Splitting on start with chomp drops the marker itself; piece [1] is the
  # text following the first occurrence.
  section = text.lines(start, chomp: true)[1]
  return '' if section.nil?

  section.lines(fin, chomp: true).first || ''
end
|
|
|
# split_text expects:
#   text  - the text we're working on
#   start - the starting string to search for
#   fin   - the ending string to search for
# This method returns everything between start and fin and handles multiple
# sections: the pieces found between each start/fin pair are concatenated.
# Text before the first start, and text between a fin and the next start, is
# dropped. Returns "" when start never occurs.
# Example:
#   For the following text:
#     > asdfasdf
#     > AAAAA
#     > testing
#     > abcdefg
#     > BBBBB
#     > nothing
#     > something
#     > AAAAA
#     > moo said the cow.
#     > cluck said the chicken.
#     > BBBBB
#     > bark said the dog
#   Running split_text(text, 'AAAAA', 'BBBBB') would return:
#     > testing
#     > abcdefg
#     > moo said the cow.
#     > cluck said the chicken.
def split_text(text, start, fin)
  text.lines(start, chomp: true)
      .drop(1) # everything before the first start marker is discarded
      .map { |section| section.lines(fin, chomp: true).first.to_s } # keep only the part before fin
      .join
end
|
|
|
# load_bins reads the bin csv file into a hash of bin name => array of words.
# The first column of each row is the bin name, the remaining non-empty
# columns are its words. Blank rows are ignored.
def load_bins(binfile)
  CSV.read(binfile).each_with_object({}) do |row, bins|
    name, *words = row
    next if name.nil?

    bins[name] = words.compact
  end
end

# build_output counts every bin against text and returns the output hash:
# :filename, :total_words, then one entry per bin holding :words, :total and
# :frequency (total / total_words).
def build_output(outfile, text, bins)
  total_words = text.split.size
  output = { filename: outfile, total_words: total_words }
  bins.each do |bin_number, words|
    entry = { words: bin_counter(words, text) }
    entry[:total] = count_total(entry)
    # Guard the division: 0 / 0.0 is NaN, which cannot be written as JSON.
    entry[:frequency] = total_words.zero? ? 0.0 : entry[:total].to_f / total_words
    output[bin_number.to_sym] = entry
  end
  output
end

# section_timestamp returns the first non-blank line of chunk turned into a
# file-name friendly string: slashes become dashes, colons are removed and
# spaces become underscores. Returns nil when chunk has no non-blank line.
def section_timestamp(chunk)
  first_line = chunk.lines.find { |line| !line.strip.empty? }
  return nil if first_line.nil?

  # Non-bang tr: tr! returns nil when nothing was replaced, which breaks a
  # chain as soon as one of the characters is absent from the timestamp.
  first_line.tr('/', '-').tr(':', '').tr(' ', '_').strip
end

# process_file expects:
#   file_name - the name of the file to process
#   binfile   - the name of the bin file (csv) to use
#   type      - which type of file are we processing, must be 'pn' or 'iat'
#
# This method is the meat and potatoes. Performs the text stripping, word
# counting, and creates the output files.
#   'iat' - counts the text between PLOVEINTAKE and PLOVECLOSING and writes
#           <name>-out.json and <name>-out.csv
#   'pn'  - splits the file on "Date and time:", and for each section counts
#           the text between Narrative: and Signatures:, writing
#           <name>_<timestamp>-out.json and <name>_<timestamp>-out.csv
# <name> is file_name with '.txt' removed. file_name itself is not modified.
# Raises ArgumentError when type is neither 'iat' nor 'pn'.
def process_file(file_name, binfile, type)
  unless %w[iat pn].include?(type)
    raise ArgumentError, "type must be 'iat' or 'pn', got #{type.inspect}"
  end

  bins = load_bins(binfile)
  text = File.read(file_name)
  # sub returns a new string, so the caller's file_name is left untouched.
  base_name = file_name.sub('.txt', '')

  if type == 'iat'
    body = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING')
    output = build_output(base_name, body, bins)
    write_output_json(output, "#{base_name}-out.json")
    write_output_csv(output, "#{base_name}-out.csv")
  else
    # Each section starts right after a "Date and time:" marker; the text
    # before the first marker is ignored.
    text.lines('Date and time:', chomp: true).drop(1).each do |chunk|
      timestamp = section_timestamp(chunk)
      abort "No timestamp found after 'Date and time:' in #{file_name}" if timestamp.nil?

      outfile = "#{base_name}_#{timestamp}"
      body = strip_text(chunk, 'Narrative:', 'Signatures:')
      output = build_output(outfile, body, bins)
      write_output_json(output, "#{outfile}-out.json")
      write_output_csv(output, "#{outfile}-out.csv")
    end
  end
end
|
|
|
# process_dir expects:
#   dir_name - a directory containing text files to process
#              (with or without a trailing slash)
#   binfile  - the name of the bin file
#   type     - which type of file are we processing, must be 'pn' or 'iat'
#
# This method will process all .txt files in the supplied directory, in
# alphabetical order, by calling process_file on each one.
def process_dir(dir_name, binfile, type)
  # File.join inserts the path separator when dir_name lacks a trailing slash.
  Dir.glob(File.join(dir_name, '*.txt')).sort.each do |file_name|
    puts "Processing #{file_name}"
    process_file(file_name, binfile, type)
  end
end
|
|
|
# generate_master_output expects:
#   dir_name    - a directory containing the *.json output files
#                 (with or without a trailing slash)
#   binfile     - the name of the bin file (csv); one bin per non-blank row
#   master_file - the CSV file to write, defaults to 'master.csv' in the
#                 current directory
#
# This method collects every *.json file in dir_name into one CSV file with
# one row per JSON file: the file name, the total word count, then the words,
# total and frequency of each bin. The header numbers the bins 1..N, where N
# is the number of non-blank rows in the bin file.
def generate_master_output(dir_name, binfile, master_file = 'master.csv')
  # Count parsed rows rather than raw lines so blank lines in the bin file
  # do not add phantom header columns.
  bin_count = CSV.read(binfile).count { |row| !row.compact.empty? }
  bin_header = (1..bin_count).flat_map do |num|
    ["Bin #{num} words", "Bin #{num} total", "Bin #{num} frequency"]
  end

  CSV.open(master_file, 'wb') do |csv|
    csv << ['File', 'Total Words', *bin_header]
    Dir.glob(File.join(dir_name, '*.json')).sort.each do |file_name|
      puts "Getting data from: #{file_name}"
      data_hash = JSON.parse(File.read(file_name))
      # delete returns the removed value, leaving only the bin entries behind.
      csv_row = [data_hash.delete('filename'), data_hash.delete('total_words')]
      data_hash.each_value do |bin|
        csv_row.push(bin['words'], bin['total'], bin['frequency'])
      end
      csv << csv_row
    end
  end
end
|
|
|
# Command-line entry point: parse the options, validate them, then process
# either a single file or a whole directory (followed by the master CSV).
options = {}
parser = OptionParser.new do |opts|
  opts.banner = 'sorter.rb --options'
  opts.on("-f", "--file file", "Name of the file to process") do |file|
    options[:file] = file
  end
  opts.on("-t", "--type type", "Type of file. Must be \"iat\" or \"pn\"") do |type|
    options[:type] = type
  end
  opts.on("-b", "--bin-file binfile", "Name of the bin file") do |binfile|
    options[:binfile] = binfile
  end
  opts.on("-d", "--directory dir", "Directory containing text files to process") do |dir|
    options[:dir] = dir
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  # Unknown flag or missing argument: show the reason and the usage text
  # instead of a backtrace.
  abort "#{e.message}\n#{parser}"
end

if options[:file] && options[:dir]
  abort "Invalid options, you must specify either a file or a directory of files, not both."
elsif options[:file] || options[:dir]
  # Both remaining options are required by the processing methods.
  abort "Missing option: --bin-file is required.\n#{parser}" unless options[:binfile]
  unless %w[iat pn].include?(options[:type])
    abort "Invalid option: --type must be \"iat\" or \"pn\".\n#{parser}"
  end

  if options[:file]
    process_file(options[:file], options[:binfile], options[:type])
  else
    process_dir(options[:dir], options[:binfile], options[:type])
    generate_master_output(options[:dir], options[:binfile])
  end
else
  # Nothing to do: show the usage text.
  puts parser
end
|
|
|
|