fileslurp source code

1 /**
2 Generic Loader for delimited text files.
3 
4 $(LREF slurpy) is the main function to be used.
5 
6 Copyright: Copyright 2013 the authors.
7 
8 License: BSD 3-Clause
9 
10 Authors: $(WEB https://github.com/agordon/ , A. Gordon), JM
11 */
12 module fileslurp;
13 
14 import std.typetuple;
15 import std.traits;
16 import std.typecons;
17 import std.functional;
18 import std.string;
19 import std.array ;
20 import std.conv: text;
21 import std.exception;
22 import std.stdio;
23 import std.file;
24 import std.range;
25 
26 private {
27 
/**
Removes one leading $(D delimiter) from the front of $(D input_str).

Params:
	input_str = the text being consumed; advanced by one element on success
	delimiter = the element expected at the front of $(D input_str)

Throws:
Exception ("missing delimiter") when $(D input_str) is empty or does not
start with $(D delimiter). $(D input_str) is left untouched in that case.
*/
@safe pure void consume_delimiter(S, D)(ref S input_str, const D delimiter)
{
	const bool starts_with_delimiter = !input_str.empty && input_str[0] == delimiter;
	if (!starts_with_delimiter)
		throw new Exception("missing delimiter");

	// Drop exactly the one element that matched.
	input_str = input_str[1 .. $];
}
35 
// Tests for consume_delimiter: a leading delimiter is stripped, anything else throws.
unittest
{
	string s = "\t2\t3";
	consume_delimiter(s,'\t');
	assert(s=="2\t3");
	//Trying to remove a delimiter when none is available is a throwable offense
	assertThrown!Exception(consume_delimiter(s,'\t'));
	//Trying to remove a delimiter from an empty string is a throwable offense
	s = "";
	assertThrown!Exception(consume_delimiter(s,' '));
}
47 
/**
Splits off the leading field of $(D input_str).

The field is everything before the first occurrence of $(D delimiter), or the
whole string when no delimiter is present. The delimiter itself is NOT
consumed: on return $(D input_str) starts at the delimiter (or is empty).

Params:
	input_str = the text being consumed; advanced past the returned field
	delimiter = the character that terminates a field

Returns:
The leading field (possibly empty), as a slice of the original $(D input_str).
*/
@safe S consume_string_field(S,D)(ref S input_str, const D delimiter)
{
	// Default: no delimiter found, so the field spans the whole input.
	size_t cut = input_str.length;

	// Decode as dchar while tracking the code-unit index of each character.
	foreach (idx, dchar ch; input_str)
	{
		if (ch != delimiter)
			continue;
		cut = idx;
		break;
	}

	S field = input_str[0 .. cut];
	input_str = input_str[cut .. $];
	return field;
}
61 
// Tests for consume_string_field: the field is returned and the input is left
// positioned at the delimiter (which is not consumed).
unittest
{
	// Consume the first field
	string s = "hello\tworld";
	string t = consume_string_field(s,'\t');
	assert(s=="\tworld");
	assert(t=="hello");

	// Consume the next (and last) field
	consume_delimiter(s,'\t');
	t = consume_string_field(s,'\t');
	assert(s=="");
	assert(t=="world");

	// No string before delimiter - return an empty string
	s = "\tfoo\tbar";
	t = consume_string_field(s,'\t');
	assert(s=="\tfoo\tbar");
	assert(t=="");

	// Empty string - is a valid single (empty) field
	s = "";
	t = consume_string_field(s,'\t');
	assert(s=="");
	assert(t=="");

	// No delimiter in string - treat it as a valid single field
	s = "hello world";
	t = consume_string_field(s,'\t');
	assert(s=="");
	assert(t=="hello world");
}
94 
/**
Makes control characters visible for use in error messages.

Each LF, TAB, CR and NUL in $(D s) is replaced by the literal text
"<LF>", "<TAB>", "<CR>" and "<NULL>" respectively; every other character is
passed through unchanged.

Params:
	s = the text to render

Returns:
A copy of $(D s) with the four control characters spelled out.
*/
@safe pure S quotemeta(S)(const S s)
{
	// Replacement table handed to std.string.translate.
	string[dchar] visible;
	visible['\n'] = "<LF>";
	visible['\t'] = "<TAB>";
	visible['\r'] = "<CR>";
	visible['\0'] = "<NULL>";

	return translate(s, visible);
}
104 
// Tests for the string overload of quotemeta.
unittest
{
	string s="1\t2\t3\n";
	auto t = quotemeta(s);
	assert(t=="1<TAB>2<TAB>3<LF>");

	//String with null ("\000" is an octal escape for NUL, followed by the character '2')
	s="1\0002";
	t = quotemeta(s);
	assert(t=="1<NULL>2");

	//Empty string
	s="";
	t = quotemeta(s);
	assert(t=="");

	// Normal string (a literal backslash followed by 't' is not a TAB)
	s="1\\t2";
	t = quotemeta(s);
	assert(t=="1\\t2");
}
126 
/**
Single-character variant of quotemeta, used to print a delimiter in error messages.

Params:
	c = the character to render

Returns:
"<LF>", "<TAB>", "<CR>" or "<NULL>" for the matching control character;
otherwise a one-character string containing $(D c).
*/
@safe pure string quotemeta(const char c)
{
	switch (c)
	{
		case '\n': return "<LF>";
		case '\t': return "<TAB>";
		case '\r': return "<CR>";
		case '\0': return "<NULL>";
		default:   return [c];
	}
}
138 
// Tests for the single-character overload of quotemeta.
unittest
{
	assert(quotemeta('\t')=="<TAB>");
	assert(quotemeta('\r')=="<CR>");
	assert(quotemeta('\n')=="<LF>");
	assert(quotemeta('\00')=="<NULL>");
	// Ordinary characters are returned as-is
	assert(quotemeta('t')=="t");
}
147 
148 } // private
149 
150 
/**
Parses string $(D input), delimited by character $(D delimiter), into the tuple $(D arg).

Params:
	input = the text of a single record
	delimiter = the character separating consecutive fields
	arg = a tuple; its members are filled, in order, from the fields of $(D input).
	      Numeric members are converted with $(D std.conv.parse); every other
	      member is treated as a string field.

Returns:
On success, the function returns nothing (void), and all the members of the tuple are populated.

Throws:
$(XREF std.exception.Exception) on failure to correctly parse the string.
When a numeric conversion fails, the underlying $(D ConvException) is attached
as the next exception in the chain.

Example:
----
string s = "Hello World 42";
Tuple!(string,string,int) t;
parse_delimited_string(s,' ',t);
assert(t[0]=="Hello");
assert(t[1]=="World");
assert(t[2]==42);
----

Notes:
$(OL
	$(LI Parsing is much stricter (and less tolerant) than $(XREF std.format.formattedRead))
	$(LI White-space is never automatically skipped)
	$(LI A space delimiter consumes only the space character (ASCII 32, hex 20), not TAB (ASCII 9))
	$(LI Multiple consecutive delimiters are not consumed as one delimiter (e.g. "1\t\t\t2" is considered a string with four fields - it has three delimiters. It will throw an exception because empty fields are not allowed).)
	$(LI All fields must exist (i.e. if the tuple $(D arg) has 3 members, the $(D input) string must contain two delimiters and three valid values))
	$(LI For a string field, empty values are not acceptable, will throw an exception)
	$(LI Extra characters at the end of a field or the line will throw an exception)
)

*/
@safe void parse_delimited_string(DATA)(const string input, const char delimiter, ref DATA arg)
{
	// The module imports only `text` from std.conv, so bring the remaining
	// names into scope here instead of relying on the fully qualified
	// `std.conv.…` spelling being visible.
	import std.conv : ConvException, parse;

	string remaining_input = input;

	foreach (i, T; DATA.Types)
	{
		//TODO: Handle other types (for now, only numeric or strings)
		static if (isNumeric!T) {
			try {
				// consume a numeric field; parse() advances remaining_input
				// past the characters it converted
				arg[i] = parse!T(remaining_input);
			} catch ( ConvException e ) {
				// Keep the original conversion error reachable through the chain
				throw new Exception(text("failed to parse numeric value in field ", i+1,
							" (text is '",quotemeta(remaining_input),"')"), e);
			}
		} else 	{
			// consume a string field (stops at, but does not consume, the delimiter)
			arg[i] = consume_string_field(remaining_input,delimiter);
			if (arg[i].empty)
				throw new Exception(text("empty text at field ", i+1,
							" (remaining text is '",quotemeta(remaining_input),"')"));
		}

		static if (i<DATA.length-1) {
			//Not the last field - require more input
			if (remaining_input.empty)
				throw new Exception(text("input terminated too soon (expecting ",
							DATA.length," fields, got ", i+1, ")"));

			//Following the converted value of this field,
			//require a delimiter (to prevent extra characters, even whitespace)
			if (remaining_input[0] != delimiter)
				throw new Exception(text("extra characters in field ",i+1,
							" (starting at '",quotemeta(remaining_input),"')"));
			consume_delimiter(remaining_input,delimiter);
		} else {
			// Last field: check for extra input
			if (!remaining_input.empty)
				throw new Exception(text("extra characters in last field ",i+1,
							" (starting at '",quotemeta(remaining_input),"')"));
		}

	}
}
226 
// Tests for parse_delimited_string: accepted inputs, and every strictness rule
// (missing/extra fields, stray whitespace, empty string fields, bad numbers).
unittest
{
	Tuple!(int,string,int) a;
	parse_delimited_string("1 2 3",' ',a);
	assert(a[0]==1 && a[1]=="2" && a[2]==3);

	parse_delimited_string("1\t2\t3",'\t',a);
	assert(a[0]==1 && a[1]=="2" && a[2]==3);

	//Extra delimiter at the end of the line is not OK
	assertThrown!Exception(parse_delimited_string("1 2 3 ",' ',a));

	//Invalid number on first field (parse!int should fail)
	assertThrown!Exception(parse_delimited_string(".1 2 3",' ',a));

	//Extra characters in field 1 (after successful parse!int)
	assertThrown!Exception(parse_delimited_string("1. 2 3",' ',a));

	//Line contains too many fields
	assertThrown!Exception(parse_delimited_string("1 2 3 4",' ',a));

	//Line is too short
	assertThrown!Exception(parse_delimited_string("1 2",' ',a));

	//non-space/tab delimiter is fine
	parse_delimited_string("1|2|3",'|',a);
	assert(a[0]==1 && a[1]=="2" && a[2]==3);
	parse_delimited_string("1|  2  |3",'|',a);
	assert(a[0]==1 && a[1]=="  2  " && a[2]==3);

	//Spaces are bad (and not ignored) if delimiter is not space (for numeric fields)
	assertThrown!Exception(parse_delimited_string("1 |2|3",'|',a));
	assertThrown!Exception(parse_delimited_string(" 1|2|3",'|',a));
	assertThrown!Exception(parse_delimited_string(" 1|2| 3",'|',a));
	assertThrown!Exception(parse_delimited_string("1|2|3 ",'|',a));

	//For string fields, empty values are not OK (different from formattedRead())
	assertThrown!Exception(parse_delimited_string("1||3",'|',a));

	//For string fields, last value can't be empty (different from formattedRead())
	Tuple!(int,string,string) b;
	assertThrown!Exception(parse_delimited_string("1|2|",'|',b));

	//One field is OK
	Tuple!(string) c;
	parse_delimited_string("foo",' ',c);
	assert(c[0]=="foo");

	//Fields that are OK for floating-point types should not work for integers (extra characters)
	Tuple!(real,int) d;
	parse_delimited_string("4.5 9",' ',d);
	assert(d[0]==4.5 && d[1]==9);
	Tuple!(int,real) e;
	assertThrown!Exception(parse_delimited_string("4.5 9",' ',e));

	//scientific notation - OK for floating-point types
	Tuple!(double,double) f;
	parse_delimited_string("-0.004e3 +4.3e10",' ',f);
	assert(f[0]==-0.004e3 && f[1]==43e9);

	//Scientific notation - fails for integers
	Tuple!(int,int) g;
	assertThrown!Exception(parse_delimited_string("-0.004e3 +4.3e10",' ',g));
}
291 
292 
/**
Loads a delimited text file, line-by-line, parses the line into fields, and calls a delegate/function for each line.

Params:
	MEMBERS = a Tuple type listing the number and types of the expected fields
	STORE_FUNCTION = callable invoked once per line with the parsed tuple
	delimiter = the field delimiter (default = TAB)
	filename = the file to read

Returns:
On success, the function returns nothing (void); the callback has been called for every line.

Throws:
$(XREF std.file.FileException) when a line cannot be parsed. The message contains
the line number, the expected fields, the delimiter, the offending line and the
underlying error text. Any $(D Exception) thrown by $(D STORE_FUNCTION) is caught
by the same handler and reported the same way.
Failures to open the file are reported by $(D std.stdio.File) itself and are not wrapped.

Example:
----
// Load a text file with three numeric columns,
// Store the tuple in an array
// (NOTE: this is a naive, inefficient way to populate an array, see NOTES)
alias Tuple!(int,int,int) T;
T[] t;
slurpy!( T,           // The number and types of the (expected) fields in the file
	 delegate(x) { t ~= x; }, // for each line read, call this function. X will be of type T.
	 '\t'         // The delimiter (default = TAB)
       )("file.txt"); // The file name to read.
----

Example:
----
// Load a text file with three numeric columns,
// Use the second column as a KEY and the third column as the VALUE.
alias Tuple!(int,int,int) T;
int[int] data;
slurpy!( T,              // The number and types of the (expected) fields in the file
	 delegate(x) {   // for each line read, call this function. X will be of type T.
	     data[x[1]] = x[2] ;
	 },
	 '\t'             // The delimiter (default = TAB)
       )("file.txt");    // The file name to read.
----

Notes:
$(OL
	$(LI See $(LREF parse_delimited_string) for details about parsing the delimited lines of the file)
	$(LI The same tuple variable is reused for every line; the callback receives it after each successful parse)
)

*/
void slurpy(MEMBERS, alias STORE_FUNCTION, char delimiter='\t')(const string filename)
{
	static assert (isTuple!MEMBERS,"slurpy: 1st template parameter must be a Tuple with the expected columns in the file");

	auto f = File(filename);
	scope(exit) f.close();

	// 1-based number of the line currently being processed (for error messages).
	// size_t rather than int so very long files cannot overflow the counter.
	size_t lines = 0;

	alias unaryFun!STORE_FUNCTION _Fun;
	MEMBERS data;

	foreach (origline; f.byLineFast())
	{
		++lines;
		// byLineFast hands out a slice of its internal buffer; take an
		// immutable copy before parsing.
		string line = origline.idup;
		try {
			parse_delimited_string(line, delimiter, data);
			_Fun(data);
		} catch ( Exception e ) {
			// Re-throw with file/line context so the caller can locate the bad record.
			throw new FileException(filename,text("invalid input at line ", lines,
						": expected ", data.tupleof.length,
						" fields ",typeof(data.tupleof).stringof,
						" delimiter by '",quotemeta(delimiter),
						"' got '", origline,
						"' error details: ", e.msg ));
		}
	}
}
365 
// Tests for slurpy: a well-formed file is delivered line by line to the
// callback, and a malformed line raises an exception.
unittest
{
	import std.file ;
	// Scratch file; removed again when the test scope exits
	auto deleteme = testFilename();
	write(deleteme,"1 2 3\n4 5 6\n");
	scope(exit) { assert(exists(deleteme)); remove(deleteme); }

	//Load a text file, with three fields, delimited with spaces.
	alias Tuple!(int,int,int) T;
	T[] t;
	slurpy!( T,         // The number and types of the (expected) fields in the file
		 delegate(x) { t ~= x; }, // for each line read, call this function. X will be of type T.
		 ' '        // The delimiter (default = TAB)
	       )(deleteme); // The file name to read.
	assert(t.length==2);
	assert(t[0] == tuple(1,2,3));
	assert(t[1] == tuple(4,5,6));

	//Any kind of invalid data should throw an exception
	//NOTE: the delegate function does nothing, because we don't care about the data
	//      in this test.
	//NOTE: see more test cases for failed parsing in the unittest of 'parse_delimited_string'.
	auto deleteme2 = testFilename() ~ ".2";
	write(deleteme2,"1 Foo 3\n4 5 6\n"); // conversion will fail in the first line
	scope(exit) { assert(exists(deleteme2)); remove(deleteme2); }
	// NOTE(review): `(x) => {}` is a lambda whose result is an empty function
	// literal, not a lambda with an empty body; it still serves as a no-op here.
	assertThrown!Exception( slurpy!( T, (x) => {}, ' ')(deleteme2)) ;
}
393 
394 
/**
Loads a delimited text file, line-by-line, parses the line into fields, returns an array of fields.

Params:
	delimiter = the field delimiter
	Types = the expected column types, in order
	filename = the file to read

Returns:
On success, returns an array with one element per line. With several column
types each element is a $(D Tuple!(Types)); with exactly one column type the
element is that type itself (e.g. $(D int[]) for $(D slurpy_array!('\t', int))).

Throws:
Whatever $(LREF slurpy) throws: $(XREF std.file.FileException) when a line
cannot be parsed, and the errors raised by $(D std.stdio.File) when the file
cannot be opened.

Example:
----
// Load a text file, tab-delimited, with three numeric columns.

auto data = slurpy_array!('\t', int,int,int)("file.txt");

// data[0] will be of type Tuple!(int,int,int)
----
*/
Select!(Types.length == 1, Types[0][], Tuple!(Types)[])
slurpy_array(char delimiter, Types...)(string filename)
{
	alias RetT = typeof(return);

	Appender!RetT app;

	static if (Types.length == 1 && !isTuple!(Types[0]))
	{
		// Single plain column: slurpy insists on a Tuple, so parse into a
		// one-member tuple and store just that member.
		alias Row = Tuple!(Types);
		slurpy! ( Row, row => app.put(row[0]) , delimiter ) (filename);
	}
	else
	{
		// Several columns (or a single argument that already is a Tuple):
		// the array element type is exactly what slurpy should parse into.
		alias MEMBERS = ElementType!RetT;
		slurpy! ( MEMBERS, x => app.put(x) , delimiter ) (filename);
	}

	return app.data;
}
427 
// Test for slurpy_array: a space-delimited file with three int columns is
// returned as an array of tuples.
unittest
{
	import std.file ;
	// Scratch file; removed again when the test scope exits
	auto deleteme = testFilename() ~ ".3";
	write(deleteme,"1 2 3\n4 5 6\n");
	scope(exit) { assert(exists(deleteme)); remove(deleteme); }

	//Load a text file, with three fields, delimited with spaces.
	auto t = slurpy_array!( ' ', // delimiter
			        int, int, int // expected fields in the text file
			      )(deleteme);
	assert(t.length==2);
	assert(t[0] == tuple(1,2,3));
	assert(t[1] == tuple(4,5,6));
}
443 
/*
Builds a scratch-file name for the unit tests (only compiled with -unittest).

The name combines the current process id, the base name of the calling source
file and the calling line number, so separate call sites and concurrently
running test processes get distinct names.

Params:
	file = source file of the caller (defaults to the call site)
	line = source line of the caller (defaults to the call site)

Returns:
A relative file name of the form "deleteme-.<pid>.<file>.<line>".
*/
version(unittest) string testFilename(string file = __FILE__, size_t line = __LINE__)
{
	import std.path : baseName;
	// thisProcessID is the std.process name for the current process id
	// (the bare `getpid` is not part of the current std.process API).
	import std.process : thisProcessID;
	return text("deleteme-.", thisProcessID, ".", baseName(file), ".", line);
}
450 
451 /*
452 On Thursday, 16 May 2013 at 10:35:12 UTC, Dicebot wrote:
453 > Want to bring into discussion people that are not on Google+.
454 > Samuel recently has posted there some simple experiments with
455 > bioinformatics and bad performance of Phobos-based snippet has
456 > surprised me.
457 >
458 > I did explore issue a bit and reported results in a blog post
459 > (snippets are really small and simple) :
460 > http://dicebot.blogspot.com/2013/05/short-performance-tuning-story.html
461 >
462 > One open question remains though - can D/Phobos do better here?
463 > Can some changes be done to Phobos functions in question to
464 > improve performance or creating bioinformatics-specialized
465 > library is only practical solution?
466 
467 I bet the problem is in readln. Currently, File.byLine() and
468 readln() are extremely slow, because they call fgetc() one char
469 at a time.
470 
471 I made an "byLineFast" implementation some time ago that is 10x
472 faster than std.stdio.byLine. It reads lines through rawRead, and
473 using buffers instead of char by char.
474 
475 I don't have the time to make it phobos-ready (unicode, etc.).
476 But I'll paste it here for any one to use (it works perfectly).
477 
478 --jm
479 */
480 
481 
482 import std.stdio;
483 import std.string: indexOf;
484 import core.stdc.string: memmove;
485 
/**
   Reads by line in an efficient way (10 times faster than File.byLine
   from std.stdio).
   This is accomplished by reading entire buffers (fgetc() is not used
   for the line data), and allocating as little as possible.

   The char \n is considered as separator, removing the previous \r
   if it exists.

   The \n is never returned. The \r is not returned if it was
   part of a \r\n (but it is returned if it was by itself).
   NOTE(review): a lone \r that is the very last byte of a file without a
   final \n is also stripped by the end-of-file branch of popFront.

   The returned slice is always a view into a temporary buffer that is
   overwritten by the next popFront(), so it must not be stored.
   If necessary, use .dup or .idup to make a copy.

   Example:

         File f = File("file.txt");
         foreach (char[] line; byLineFast(f)) {
             ...process line...
             //Make a copy:
             string copy = line.idup;
         }

   The file isn't closed when done iterating, unless it was
   the only reference to the file (same as std.stdio.byLine).
   (example: byLineFast(File("file.txt"))).
*/
struct byLineFast {
    File file;
    char[] line;             // current line; null once the input is exhausted
    bool first_call = true;  // true until front() has primed the first line
    char[] buffer;           // backing storage for everything read from the file
    char[] strBuffer;        // the not-yet-consumed part of buffer

    /// Wraps $(D f); $(D bufferSize) is the initial size of the read buffer
    /// (it is doubled whenever a line does not fit).
    this(File f, int bufferSize=4096) {
        assert(bufferSize > 0);
        file = f;
        buffer.length = bufferSize;
    }

    /// True when there are no more lines to return.
    @property bool empty() const {
        import core.stdc.stdio : EOF, fgetc, ungetc;

        //Its important to check "line !is null" instead of
        //"line.length != 0", otherwise, no empty lines can
        //be returned, the iteration would be closed.
        if (line.ptr !is null) {
            return false;
        }
        if (!file.isOpen) {
            //Clean the buffer to avoid pointer false positives:
            (cast(char[])buffer)[] = 0;
            return true;
        }

        //First read. Determine if it's empty and put the char back.
        auto mutableFP = (cast(File*) &file).getFP();
        auto c = fgetc(mutableFP);
        if (c == EOF) {
            //Clean the buffer to avoid pointer false positives:
            (cast(char[])buffer)[] = 0;
            return true;
        }
        if (ungetc(c, mutableFP) != c) {
            assert(false, "Bug in cstdlib implementation");
        }
        return false;
    }

    /// The current line, without its terminator. The first call reads the first line.
    @property char[] front() {
        if (first_call) {
            popFront();
            first_call = false;
        }
        return line;
    }

    /// Advances to the next line, refilling (and growing) the buffer as needed.
    void popFront() {
        if (strBuffer.length == 0) {
            strBuffer = file.rawRead(buffer);
            if (strBuffer.length == 0) {
                //Nothing left in the file: release it and mark the end.
                file.detach();
                line = null;
                return;
            }
        }

        //indexOf returns a signed index (-1 when not found). Keep it in
        //ptrdiff_t and use size_t for slice bounds so the code also compiles
        //where size_t is 32 bits wide.
        const ptrdiff_t found = strBuffer.indexOf('\n');
        if (found >= 0) {
            const size_t pos = found;
            if (pos != 0 && strBuffer[pos-1] == '\r') {
                line = strBuffer[0 .. (pos-1)];
            } else {
                line = strBuffer[0 .. pos];
            }
            //Pop the line, skipping the terminator:
            strBuffer = strBuffer[(pos+1) .. $];
        } else {
            //More needs to be read here. Copy the tail of the buffer
            //to the beginning, and try to read with the empty part of
            //the buffer.
            //If no buffer was left, extend the size of the buffer before
            //reading. If the file has ended, then the line is the entire
            //buffer.

            if (strBuffer.ptr != buffer.ptr) {
                //Must use memmove because there might be overlap
                memmove(buffer.ptr, strBuffer.ptr, strBuffer.length * char.sizeof);
            }
            const size_t spaceBegin = strBuffer.length;
            if (strBuffer.length == buffer.length) {
                //Must extend the buffer to keep reading.
                assumeSafeAppend(buffer);
                buffer.length = buffer.length * 2;
            }
            char[] readPart = file.rawRead(buffer[spaceBegin .. $]);
            if (readPart.length == 0) {
                //End of the file. Return whats in the buffer.
                //The next popFront() will try to read again, and then
                //mark empty condition.
                if (spaceBegin != 0 && buffer[spaceBegin-1] == '\r') {
                    line = buffer[0 .. spaceBegin-1];
                } else {
                    line = buffer[0 .. spaceBegin];
                }
                strBuffer = null;
                return;
            }
            strBuffer = buffer[0 .. spaceBegin + readPart.length];
            //Now that we have new data in strBuffer, we can go on.
            //If a line isn't found, the buffer will be extended again to read more.
            popFront();
        }
    }
}