Module: TextDataTools::Column

Defined in:
lib/text-data-tools.rb

Overview

Tools for extracting data from text files where the data appears in columns with or without headers for each column.

Defined Under Namespace

Classes: DataFile, NotFoundError

Class Method Summary collapse

Class Method Details

.column_index_from_headers(line, column_header, header_match) ⇒ Object

Raises:

  • (ArgumentError)


144
145
146
147
148
149
150
151
152
# File 'lib/text-data-tools.rb', line 144

# Find the zero-based index of the single column whose header matches
# column_header.
#
# line          - the header line to search. nil (e.g. the result of
#                 reading a header from an empty file) is treated as a
#                 line containing no headers.
# column_header - a Regexp, or a String which is matched literally
#                 anywhere inside a header.
# header_match  - Regexp handed to String#scan to split line into headers.
#
# Returns the index of the matching header within the scanned headers.
# Raises ArgumentError if no header matches, or if more than one matches.
def self.column_index_from_headers(line, column_header, header_match)
	# to_s maps a nil header line to "" so the caller receives the
	# ArgumentError below instead of a NoMethodError on nil.
	headers = line.to_s.scan(header_match)
	# Build the search pattern once rather than once per header.
	pattern = column_header.kind_of?(Regexp) ? column_header : Regexp.new(Regexp.escape(column_header))
	matching = headers.each_index.select{|i| headers[i] =~ pattern}
	raise ArgumentError.new("column_header: #{column_header.inspect} does not match any columns in #{headers.inspect}") if matching.empty?
	raise ArgumentError.new("column_header: #{column_header.inspect} matches more than 1 column in #{headers.inspect}") if matching.size > 1
	matching.first
end

.get_1d_array(filename, has_header_line, column_header, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a one-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/text-data-tools.rb', line 28

# Read a single column out of the text file filename and return it as a
# flat array of strings, one element per line after the optional header.
#
# has_header_line - when true, the first line is read as the header line.
# column_header   - an Integer column number (zero-based), or a String or
#                   Regexp that is resolved to a column number through
#                   column_index_from_headers.
# match           - Regexp used with String#scan to split a data line.
# header_match    - Regexp used to split the header line.
#
# A line with no item at the requested position contributes nil.
# Raises ArgumentError when column_header is of an unsupported type, and a
# RuntimeError when a String/Regexp is given without a header line.
def self.get_1d_array(filename, has_header_line, column_header, match=/\S+/, header_match=/\S+/)
	unless [String, Regexp, Integer].any?{|klass| column_header.kind_of?(klass)}
		raise ArgumentError.new("column_header header should be a string, regex or integer")
	end
	File.open(filename) do |io|
		header_line = has_header_line ? io.gets : nil
		column = column_header
		case column_header
		when String, Regexp
			# A header search only makes sense if there is a header to search.
			raise ("Header search given but has_header_line = false") unless has_header_line
			column = column_index_from_headers(header_line, column_header, header_match)
		end
		# The value of this block (the collected column) is what File.open returns.
		io.each_line.map{|row| row.scan(match)[column]}
	end
end

.get_1d_array_float(*args) ⇒ Object

Calls get_1d_array and converts all data elements to floats



48
49
50
# File 'lib/text-data-tools.rb', line 48

# Forward all arguments to get_1d_array and convert each returned
# element with to_f.
def self.get_1d_array_float(*args)
	strings = get_1d_array(*args)
	strings.map(&:to_f)
end

.get_1d_array_integer(*args) ⇒ Object



51
52
53
# File 'lib/text-data-tools.rb', line 51

# Forward all arguments to get_1d_array and convert each returned
# element with to_i.
def self.get_1d_array_integer(*args)
	strings = get_1d_array(*args)
	strings.map(&:to_i)
end

.get_2d_array(filename, has_header_line, column_header, index_header = nil, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a two-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

It is assumed that the two-dimensional array is stored in a single column.
If index_header is nil, data is assumed to be separated by blank lines.
E.g.
		1.2
		4.2
		7.2

		8.2
		4.2
		2.2
If index_header is an integer or string or regexp, it selects a column
in the manner of column_header, and the data is divided by values of this
column.
E.g. 
		1  5.5
		1  3.2
		1  2.6
		2  3.2
		2  2.2
		2  6.3

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/text-data-tools.rb', line 86

# Read a single column out of the text file filename and return it as an
# array of arrays of strings (one inner array per group of lines).
#
# has_header_line - when true, the first line is read as the header line.
# column_header   - the column to extract: an Integer column number
#                   (zero-based), or a String or Regexp resolved to a
#                   column number through column_index_from_headers.
# index_header    - how lines are grouped:
#                   * nil: groups are separated by blank lines. Blank lines
#                     before the first data line are ignored; every later
#                     blank line starts a new group, so consecutive or
#                     trailing blank lines produce empty groups.
#                   * Integer, String or Regexp: selects a column in the
#                     same way as column_header; a new group starts each
#                     time the value in that column changes. Blank lines
#                     are skipped.
# match           - Regexp used with String#scan to split a data line.
# header_match    - Regexp used to split the header line.
#
# All data is returned as strings; a line with no item at the requested
# position contributes nil.
# Raises ArgumentError when column_header or index_header is of an
# unsupported type, and a RuntimeError when a String/Regexp header is given
# without a header line.
def self.get_2d_array(filename, has_header_line, column_header, index_header=nil, match=/\S+/, header_match=/\S+/)
	raise ArgumentError.new("column_header header should be a string, regex or integer") unless [String, Regexp, Integer].find{|cls| column_header.kind_of? cls}
	# This check must look at index_header; it previously re-tested
	# column_header, so an invalid index_header was never rejected.
	raise ArgumentError.new("index_header should be a string, regex, integer or nil") unless [String, Regexp, Integer, NilClass].find{|cls| index_header.kind_of? cls}
	array = []
	File.open(filename) do |file|
		headers = file.gets if has_header_line
		if [String, Regexp].find{|cls| column_header.kind_of? cls}
			raise ("Header search given but has_header_line = false") if not has_header_line
			column_header = column_index_from_headers(headers, column_header, header_match)
		end
		if [String, Regexp].find{|cls| index_header.kind_of? cls}
			raise ("Header search given but has_header_line = false") if not has_header_line
			index_header = column_index_from_headers(headers, index_header, header_match)
		end
		# Value of the index column for the group currently being filled.
		index_value = nil
		while line = file.gets
			blank = line =~ /^\s*$/
			if index_header.nil?
				if blank
					# Leading blank lines are ignored; later ones open a new group.
					array.push [] unless array.empty?
					next
				end
				# First data line: open the first group. (This used to read
				# "array.size = 0", an assignment that raised NoMethodError.)
				array.push [] if array.empty?
				values = line.scan(match)
			else
				next if blank
				values = line.scan(match)
				# Open a group for the first data line and whenever the index
				# column changes value.
				if array.empty? or index_value != values[index_header]
					array.push []
					index_value = values[index_header]
				end
			end
			# The group being filled is always the most recently opened one.
			array.last.push values[column_header]
		end
	end
	array
end

.get_2d_array_float(*args) ⇒ Object

Calls get_2d_array and converts all data elements to floats



134
135
136
# File 'lib/text-data-tools.rb', line 134

# Forward all arguments to get_2d_array and convert every element of
# every inner array with to_f.
def self.get_2d_array_float(*args)
	groups = get_2d_array(*args)
	groups.map{|group| group.map(&:to_f)}
end

.get_2d_array_integer(*args) ⇒ Object



137
138
139
# File 'lib/text-data-tools.rb', line 137

# Forward all arguments to get_2d_array and convert every element of
# every inner array with to_i.
def self.get_2d_array_integer(*args)
	groups = get_2d_array(*args)
	groups.map{|group| group.map(&:to_i)}
end