Compare commits
No commits in common. "4e1b862586a8f4ff83501d5970d5be9f04a8f11e" and "79e5bc63e1e71745da9cb9eb84470956d1a113dd" have entirely different histories.
4e1b862586
...
79e5bc63e1
112
sorter.rb
112
sorter.rb
|
@ -13,6 +13,7 @@
|
||||||
# The output file is single-line JSON, use jq to format for reading. #
|
# The output file is single-line JSON, use jq to format for reading. #
|
||||||
# #
|
# #
|
||||||
#To Do: #
|
#To Do: #
|
||||||
|
# * Output to csv #
|
||||||
############################################################################
|
############################################################################
|
||||||
require 'json'
|
require 'json'
|
||||||
require 'pp'
|
require 'pp'
|
||||||
|
@ -54,13 +55,12 @@ end
|
||||||
def write_output_csv (output, filename)
|
def write_output_csv (output, filename)
|
||||||
CSV.open(filename, 'wb') do |csv|
|
CSV.open(filename, 'wb') do |csv|
|
||||||
csv << ["bin", "words", "total"]
|
csv << ["bin", "words", "total"]
|
||||||
output.delete(:filename)
|
|
||||||
output.each_key do |key|
|
output.each_key do |key|
|
||||||
line = []
|
line = []
|
||||||
line.push(key)
|
line.push(key)
|
||||||
output[key].each_key do |sub_key|
|
output[key].each_key do |sub_key|
|
||||||
line.push(output[key][sub_key])
|
line.push(output[key][sub_key])
|
||||||
end
|
end
|
||||||
csv << line
|
csv << line
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -112,82 +112,6 @@ def split_text (text, start, fin)
|
||||||
return ret
|
return ret
|
||||||
end
|
end
|
||||||
|
|
||||||
#process_file expects:
|
|
||||||
# file_name - the name of the file to process
|
|
||||||
# binfile - the name of the bin file (csv) to use
|
|
||||||
# type - which type of file are we processing, must be 'pn' or 'iat'
|
|
||||||
#
|
|
||||||
#This method is the meat and potatoes. Performs the text stripping and word counting, and creates the output files.
|
|
||||||
def process_file (file_name, binfile, type)
|
|
||||||
csv = CSV.read(binfile)
|
|
||||||
text = File.read(file_name)
|
|
||||||
text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if type == 'iat'
|
|
||||||
text = split_text(text, 'Narrative:', 'Signatures:') if type == 'pn'
|
|
||||||
output = Hash.new #Creating the output storage object
|
|
||||||
bins = Hash.new #This hash stores the bins
|
|
||||||
outfile = file_name
|
|
||||||
outfile.slice!('.txt')
|
|
||||||
#puts outfile
|
|
||||||
output[:filename] = outfile
|
|
||||||
csv.each { |bin| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils
|
|
||||||
bins.each_key do |bin_number|
|
|
||||||
key = bin_number.to_sym
|
|
||||||
output[key] = Hash.new
|
|
||||||
output[key][:words] = bin_counter(bins[bin_number], text)
|
|
||||||
output[key][:total] = count_total(output[key])
|
|
||||||
end
|
|
||||||
write_output_json(output,outfile + '-out.json')
|
|
||||||
write_output_csv(output,outfile + '-out.csv')
|
|
||||||
end
|
|
||||||
|
|
||||||
#process_dir expects:
|
|
||||||
# dir_name - a directory containing text files to process
|
|
||||||
# binfile - the name of the bin file
|
|
||||||
# type - which type of file are we processing, must be 'pn' or 'iat'
|
|
||||||
#
|
|
||||||
#This method will process all .txt files in the supplied directory
|
|
||||||
def process_dir(dir_name, binfile, type)
|
|
||||||
Dir.glob(dir_name + '*.txt') do |file_name|
|
|
||||||
puts "Processing" + file_name
|
|
||||||
process_file(file_name, binfile, type)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
def generate_master_output(dir_name, binfile)
|
|
||||||
puts dir_name
|
|
||||||
file=File.open(binfile,"r")
|
|
||||||
bin_count = file.readlines.size
|
|
||||||
file.close
|
|
||||||
bin_header = []
|
|
||||||
bin_count.times do |num|
|
|
||||||
num += 1
|
|
||||||
words_head = "Bin " + num.to_s + " words"
|
|
||||||
total_head = "Bin " + num.to_s + " total"
|
|
||||||
bin_header.push(words_head)
|
|
||||||
bin_header.push(total_head)
|
|
||||||
end
|
|
||||||
CSV.open('master.csv', 'wb') do |csv|
|
|
||||||
header = ["File", "Total Words" ] + bin_header
|
|
||||||
csv << header
|
|
||||||
Dir.glob(dir_name + '*.json') do |file_name|
|
|
||||||
puts file_name
|
|
||||||
csv_row = []
|
|
||||||
json_file = File.read(file_name)
|
|
||||||
data_hash = JSON.parse(json_file)
|
|
||||||
csv_row.push(data_hash["filename"])
|
|
||||||
data_hash.delete("filename")
|
|
||||||
word_total = 0
|
|
||||||
data_hash.each_key do |key|
|
|
||||||
csv_row.push(data_hash[key]["words"])
|
|
||||||
csv_row.push(data_hash[key]["total"])
|
|
||||||
word_total += data_hash[key]["total"]
|
|
||||||
end
|
|
||||||
csv_row = csv_row.insert(1, word_total)
|
|
||||||
csv << csv_row
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
options = Hash.new
|
options = Hash.new
|
||||||
OptionParser.new do |opts|
|
OptionParser.new do |opts|
|
||||||
opts.banner = 'sorter.rb --options'
|
opts.banner = 'sorter.rb --options'
|
||||||
|
@ -200,17 +124,27 @@ OptionParser.new do |opts|
|
||||||
opts.on("-b", "--bin-file binfile", "Name of the bin file") do |binfile|
|
opts.on("-b", "--bin-file binfile", "Name of the bin file") do |binfile|
|
||||||
options[:binfile] = binfile
|
options[:binfile] = binfile
|
||||||
end
|
end
|
||||||
opts.on("-d", "--directory dir", "Directory containing text files to process") do |dir|
|
|
||||||
options[:dir] = dir
|
|
||||||
end
|
|
||||||
end.parse!
|
end.parse!
|
||||||
|
|
||||||
if options[:file] && options[:dir]
|
|
||||||
puts "Invalid options, you must either a file or a directoy of files."
|
csv = CSV.read(options[:binfile])
|
||||||
elsif options[:file]
|
text = File.read(options[:file])
|
||||||
process_file(options[:file], options[:binfile], options[:type])
|
text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if options[:type] == 'iat'
|
||||||
elsif options[:dir]
|
text = split_text(text, 'Narrative:', 'Signatures:') if options[:type] == 'pn'
|
||||||
process_dir(options[:dir], options[:binfile], options[:type])
|
output = Hash.new #Creating the output storage object
|
||||||
generate_master_output(options[:dir], options[:binfile])
|
bins = Hash.new #This hash stores the bins
|
||||||
end
|
outfile = options[:file]
|
||||||
|
outfile.slice!('.txt')
|
||||||
|
puts outfile
|
||||||
|
|
||||||
|
csv.each { |bin| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils
|
||||||
|
|
||||||
|
bins.each_key do |bin_number|
|
||||||
|
key = bin_number.to_sym
|
||||||
|
output[key] = Hash.new
|
||||||
|
output[key][:words] = bin_counter(bins[bin_number], text)
|
||||||
|
output[key][:total] = count_total(output[key])
|
||||||
|
end
|
||||||
|
write_output_json(output,outfile + '-out.json')
|
||||||
|
write_output_csv(output,outfile + '-out.csv')
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue