Compare commits

...

4 Commits

Author SHA1 Message Date
Jeff Yates a2f3259fdf fixed merge conflict 2020-11-21 17:09:08 -05:00
Jeff Yates fd88c3457c minor changes to output and date format 2020-11-21 16:53:48 -05:00
Jeff Yates 9a9886df9d made pn option operate on each date section 2020-11-21 16:45:55 -05:00
Jeff Yates 174effc0fb changed total word count behaviour 2020-11-21 13:08:58 -05:00
1 changed files with 49 additions and 11 deletions

View File

@ -13,6 +13,9 @@
# The output file is single-line JSON, use jq to format for reading. #
# #
#To Do: #
# * Fix word count #
# * add frequency #
# * split by dates for progress notes #
############################################################################
require 'json'
require 'pp'
@ -55,6 +58,7 @@ def write_output_csv (output, filename)
CSV.open(filename, 'wb') do |csv|
csv << ["bin", "words", "total"]
output.delete(:filename)
output.delete(:total_words)
output.each_key do |key|
line = []
line.push(key)
@ -119,14 +123,51 @@ end
#
#This method is the meat and potatoes. Performs the text stripping, word counting, and creates output files.
def process_file (file_name, binfile, type)
#text = split_text(text, 'Narrative:', 'Signatures:') if type == 'pn'
csv = CSV.read(binfile)
text = File.read(file_name)
text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if type == 'iat'
text = split_text(text, 'Narrative:', 'Signatures:') if type == 'pn'
output = Hash.new #Creating the output storage object
bins = Hash.new #This hash stores the bins
csv.each { |bin| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils
outfile = file_name
outfile.slice!('.txt')
<<<<<<< HEAD
if type == 'iat'
text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if type == 'iat'
output = Hash.new #Creating the output storage object
#puts outfile
output[:filename] = outfile
output[:total_words] = text.split.size
bins.each_key do |bin_number|
key = bin_number.to_sym
output[key] = Hash.new
output[key][:words] = bin_counter(bins[bin_number], text)
output[key][:total] = count_total(output[key])
end
write_output_json(output,outfile + '-out.json')
write_output_csv(output,outfile + '-out.csv')
elsif type == 'pn'
sections = text.lines("Date and time:", chomp: true) #sections is an arrary of each date section from the text
sections.delete_at(0) #we can ignore the first chunk of text
sections.each do |chunk|
timestamp = chunk.lines.delete_if {|line| line == "\r\n"}[0] #pulling out the timestamp
timestamp.tr!('/','-').tr!(':','').tr!(' ','_') #remove slashes and colons from timestamp, replaces spaces with unserscores
timestamp.strip!
output = Hash.new #Creating the output storage object
outfile = file_name + '_' + timestamp
outfile.slice!('.txt')
text = strip_text(chunk, 'Narrative:', 'Signatures:')
output[:filename] = outfile
output[:total_words] = text.split.size
bins.each_key do |bin_number|
key = bin_number.to_sym
output[key] = Hash.new
output[key][:words] = bin_counter(bins[bin_number], text)
output[key][:total] = count_total(output[key])
end
write_output_json(output,outfile + '-out.json')
write_output_csv(output,outfile + '-out.csv')
end
=======
puts outfile
output[:filename] = outfile
csv.each { |bin| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils
@ -135,9 +176,8 @@ def process_file (file_name, binfile, type)
output[key] = Hash.new
output[key][:words] = bin_counter(bins[bin_number], text)
output[key][:total] = count_total(output[key])
>>>>>>> d40b0ae9853ecb6d5d479ea121a7a3cdba00323c
end
write_output_json(output,outfile + '-out.json')
write_output_csv(output,outfile + '-out.csv')
end
#process_dir expects:
@ -148,13 +188,12 @@ end
#This method will process all .txt files in the supplied directory
#Calls process_file once for every path matched by the glob dir_name + '*.txt'.
#dir_name is concatenated directly with '*.txt', so it presumably must end with a
#path separator (e.g. "notes/") - TODO confirm against callers.
#binfile and type are passed through to process_file unchanged.
def process_dir(dir_name, binfile, type)
Dir.glob(dir_name + '*.txt') do |file_name|
#NOTE(review): the two puts lines below look like the before/after versions of a
#single line in this compare view (the second adds the missing space after
#"Processing") - confirm that only one of them exists in the actual file.
puts "Processing" + file_name
puts "Processing " + file_name
process_file(file_name, binfile, type)
end
end
def generate_master_output(dir_name, binfile)
puts dir_name
file=File.open(binfile,"r")
bin_count = file.readlines.size
file.close
@ -170,19 +209,18 @@ def generate_master_output(dir_name, binfile)
header = ["File", "Total Words" ] + bin_header
csv << header
Dir.glob(dir_name + '*.json') do |file_name|
puts file_name
puts 'Getting data from: ' + file_name
csv_row = []
json_file = File.read(file_name)
data_hash = JSON.parse(json_file)
csv_row.push(data_hash["filename"])
csv_row.push(data_hash["total_words"])
data_hash.delete("filename")
word_total = 0
data_hash.delete("total_words")
data_hash.each_key do |key|
csv_row.push(data_hash[key]["words"])
csv_row.push(data_hash[key]["total"])
word_total += data_hash[key]["total"]
end
csv_row = csv_row.insert(1, word_total)
csv << csv_row
end
end