Compare commits
4 Commits
d40b0ae985
...
a2f3259fdf
Author | SHA1 | Date |
---|---|---|
|
a2f3259fdf | |
|
fd88c3457c | |
|
9a9886df9d | |
|
174effc0fb |
58
sorter.rb
58
sorter.rb
|
@ -13,6 +13,9 @@
|
||||||
# The output file is single-line JSON, use jq to format for reading. #
|
# The output file is single-line JSON, use jq to format for reading. #
|
||||||
# #
|
# #
|
||||||
#To Do: #
|
#To Do: #
|
||||||
|
# * Fix word count #
|
||||||
|
# * add frequency #
|
||||||
|
# * split by dates for progress notes #
|
||||||
############################################################################
|
############################################################################
|
||||||
require 'json'
|
require 'json'
|
||||||
require 'pp'
|
require 'pp'
|
||||||
|
@ -55,6 +58,7 @@ def write_output_csv (output, filename)
|
||||||
CSV.open(filename, 'wb') do |csv|
|
CSV.open(filename, 'wb') do |csv|
|
||||||
csv << ["bin", "words", "total"]
|
csv << ["bin", "words", "total"]
|
||||||
output.delete(:filename)
|
output.delete(:filename)
|
||||||
|
output.delete(:total_words)
|
||||||
output.each_key do |key|
|
output.each_key do |key|
|
||||||
line = []
|
line = []
|
||||||
line.push(key)
|
line.push(key)
|
||||||
|
@ -119,17 +123,20 @@ end
|
||||||
#
|
#
|
||||||
#This method is the meat and potatoes. Performs the text stripping, word counting, and creates output files.
|
#This method is the meat and potatoes. Performs the text stripping, word counting, and creates output files.
|
||||||
def process_file (file_name, binfile, type)
|
def process_file (file_name, binfile, type)
|
||||||
|
#text = split_text(text, 'Narrative:', 'Signatures:') if type == 'pn'
|
||||||
csv = CSV.read(binfile)
|
csv = CSV.read(binfile)
|
||||||
text = File.read(file_name)
|
text = File.read(file_name)
|
||||||
text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if type == 'iat'
|
|
||||||
text = split_text(text, 'Narrative:', 'Signatures:') if type == 'pn'
|
|
||||||
output = Hash.new #Creating the output storage object
|
|
||||||
bins = Hash.new #This hash stores the bins
|
bins = Hash.new #This hash stores the bins
|
||||||
|
csv.each { |bin| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils
|
||||||
outfile = file_name
|
outfile = file_name
|
||||||
outfile.slice!('.txt')
|
outfile.slice!('.txt')
|
||||||
puts outfile
|
<<<<<<< HEAD
|
||||||
|
if type == 'iat'
|
||||||
|
text = strip_text(text, 'PLOVEINTAKE', 'PLOVECLOSING') if type == 'iat'
|
||||||
|
output = Hash.new #Creating the output storage object
|
||||||
|
#puts outfile
|
||||||
output[:filename] = outfile
|
output[:filename] = outfile
|
||||||
csv.each { |bin| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils
|
output[:total_words] = text.split.size
|
||||||
bins.each_key do |bin_number|
|
bins.each_key do |bin_number|
|
||||||
key = bin_number.to_sym
|
key = bin_number.to_sym
|
||||||
output[key] = Hash.new
|
output[key] = Hash.new
|
||||||
|
@ -138,6 +145,39 @@ def process_file (file_name, binfile, type)
|
||||||
end
|
end
|
||||||
write_output_json(output,outfile + '-out.json')
|
write_output_json(output,outfile + '-out.json')
|
||||||
write_output_csv(output,outfile + '-out.csv')
|
write_output_csv(output,outfile + '-out.csv')
|
||||||
|
elsif type == 'pn'
|
||||||
|
sections = text.lines("Date and time:", chomp: true) #sections is an arrary of each date section from the text
|
||||||
|
sections.delete_at(0) #we can ignore the first chunk of text
|
||||||
|
sections.each do |chunk|
|
||||||
|
timestamp = chunk.lines.delete_if {|line| line == "\r\n"}[0] #pulling out the timestamp
|
||||||
|
timestamp.tr!('/','-').tr!(':','').tr!(' ','_') #remove slashes and colons from timestamp, replaces spaces with unserscores
|
||||||
|
timestamp.strip!
|
||||||
|
output = Hash.new #Creating the output storage object
|
||||||
|
outfile = file_name + '_' + timestamp
|
||||||
|
outfile.slice!('.txt')
|
||||||
|
text = strip_text(chunk, 'Narrative:', 'Signatures:')
|
||||||
|
output[:filename] = outfile
|
||||||
|
output[:total_words] = text.split.size
|
||||||
|
bins.each_key do |bin_number|
|
||||||
|
key = bin_number.to_sym
|
||||||
|
output[key] = Hash.new
|
||||||
|
output[key][:words] = bin_counter(bins[bin_number], text)
|
||||||
|
output[key][:total] = count_total(output[key])
|
||||||
|
end
|
||||||
|
write_output_json(output,outfile + '-out.json')
|
||||||
|
write_output_csv(output,outfile + '-out.csv')
|
||||||
|
end
|
||||||
|
=======
|
||||||
|
puts outfile
|
||||||
|
output[:filename] = outfile
|
||||||
|
csv.each { |bin| bins[bin[0]] = bin[1..].compact } #turn the csv array into a hash, remove nils
|
||||||
|
bins.each_key do |bin_number|
|
||||||
|
key = bin_number.to_sym
|
||||||
|
output[key] = Hash.new
|
||||||
|
output[key][:words] = bin_counter(bins[bin_number], text)
|
||||||
|
output[key][:total] = count_total(output[key])
|
||||||
|
>>>>>>> d40b0ae9853ecb6d5d479ea121a7a3cdba00323c
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
#process_dir expects:
|
#process_dir expects:
|
||||||
|
@ -154,7 +194,6 @@ def process_dir(dir_name, binfile, type)
|
||||||
end
|
end
|
||||||
|
|
||||||
def generate_master_output(dir_name, binfile)
|
def generate_master_output(dir_name, binfile)
|
||||||
puts dir_name
|
|
||||||
file=File.open(binfile,"r")
|
file=File.open(binfile,"r")
|
||||||
bin_count = file.readlines.size
|
bin_count = file.readlines.size
|
||||||
file.close
|
file.close
|
||||||
|
@ -170,19 +209,18 @@ def generate_master_output(dir_name, binfile)
|
||||||
header = ["File", "Total Words" ] + bin_header
|
header = ["File", "Total Words" ] + bin_header
|
||||||
csv << header
|
csv << header
|
||||||
Dir.glob(dir_name + '*.json') do |file_name|
|
Dir.glob(dir_name + '*.json') do |file_name|
|
||||||
puts file_name
|
puts 'Getting data from: ' + file_name
|
||||||
csv_row = []
|
csv_row = []
|
||||||
json_file = File.read(file_name)
|
json_file = File.read(file_name)
|
||||||
data_hash = JSON.parse(json_file)
|
data_hash = JSON.parse(json_file)
|
||||||
csv_row.push(data_hash["filename"])
|
csv_row.push(data_hash["filename"])
|
||||||
|
csv_row.push(data_hash["total_words"])
|
||||||
data_hash.delete("filename")
|
data_hash.delete("filename")
|
||||||
word_total = 0
|
data_hash.delete("total_words")
|
||||||
data_hash.each_key do |key|
|
data_hash.each_key do |key|
|
||||||
csv_row.push(data_hash[key]["words"])
|
csv_row.push(data_hash[key]["words"])
|
||||||
csv_row.push(data_hash[key]["total"])
|
csv_row.push(data_hash[key]["total"])
|
||||||
word_total += data_hash[key]["total"]
|
|
||||||
end
|
end
|
||||||
csv_row = csv_row.insert(1, word_total)
|
|
||||||
csv << csv_row
|
csv << csv_row
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue