/**
Generic Loader for delimited text files.

$(LREF slurpy) is the main function to be used.

Copyright: Copyright 2013 the authors.

License: BSD 3-Clause

Authors: $(WEB https://github.com/agordon/ , A. Gordon), JM
*/
module fileslurp;

import std.typetuple;
import std.traits;
import std.typecons;
import std.functional;
import std.string;
import std.array ;
// `parse` and `ConvException` are imported by name so that they can be used
// without relying on the fully-qualified `std.conv.` prefix, which a selective
// import does not provide.
import std.conv: ConvException, parse, text;
import std.exception;
import std.stdio;
import std.file;
import std.range;

private {

/**
Removes one leading $(D delimiter) from $(D input_str).

Throws:
    Exception if $(D input_str) is empty or does not start with $(D delimiter).
*/
@safe pure void consume_delimiter(S, D)(ref S input_str, const D delimiter)
{
    if (input_str.empty || input_str[0] != delimiter)
        throw new Exception("missing delimiter");

    input_str = input_str[1..$];
}

unittest
{
    string s = "\t2\t3";
    consume_delimiter(s,'\t');
    assert(s=="2\t3");
    //Trying to remove a delimiter when none is available is a throwable offense
    assertThrown!Exception(consume_delimiter(s,'\t'));
    //Trying to remove a delimiter from an empty string is a throwable offense
    s = "";
    assertThrown!Exception(consume_delimiter(s,' '));
}

/**
Returns the text of $(D input_str) up to (not including) the first
$(D delimiter), and advances $(D input_str) so that it starts at that
delimiter. If no delimiter is found, the whole string is returned and
$(D input_str) becomes empty.
*/
@safe S consume_string_field(S,D)(ref S input_str, const D delimiter)
{
    // 'j' is the code-unit index of the first delimiter (or the string length).
    size_t j = input_str.length;
    foreach (i, dchar c; input_str)
    {
        if ( c == delimiter ) {
            j = i;
            break;
        }
    }
    // The returned slice is computed first; the input is advanced on exit.
    scope(exit) input_str = input_str[j .. $];
    return input_str[0 .. j];
}

unittest
{
    // Consume the first field
    string s = "hello\tworld";
    string t = consume_string_field(s,'\t');
    assert(s=="\tworld");
    assert(t=="hello");

    // Consume the next (and last) field
    consume_delimiter(s,'\t');
    t = consume_string_field(s,'\t');
    assert(s=="");
    assert(t=="world");

    // No string before delimiter - return an empty string
    s = "\tfoo\tbar";
    t = consume_string_field(s,'\t');
    assert(s=="\tfoo\tbar");
    assert(t=="");

    // Empty string - is a valid single (empty) field
    s = "";
    t = consume_string_field(s,'\t');
    assert(s=="");
    assert(t=="");

    // No delimiter in string - treat it as a valid single field
    s = "hello world";
    t = consume_string_field(s,'\t');
    assert(s=="");
    assert(t=="hello world");
}

/**
Returns a copy of $(D s) in which LF, TAB, CR and NUL characters are replaced
by the printable markers "<LF>", "<TAB>", "<CR>" and "<NULL>".
Used to build readable error messages.
*/
@safe pure S quotemeta(S)(const S s)
{
    string[dchar] meta = [ '\n' : "<LF>",
        '\t' : "<TAB>",
        '\r' : "<CR>",
        '\0' : "<NULL>" ];

    return translate(s,meta);
}

unittest
{
    string s="1\t2\t3\n";
    auto t = quotemeta(s);
    assert(t=="1<TAB>2<TAB>3<LF>");

    //String with null
    s="1\0002";
    t = quotemeta(s);
    assert(t=="1<NULL>2");

    //Empty string
    s="";
    t = quotemeta(s);
    assert(t=="");

    // Normal string
    s="1\\t2";
    t = quotemeta(s);
    assert(t=="1\\t2");
}

/**
Single-character variant: returns the printable marker for LF, TAB, CR and
NUL, or a one-character string holding $(D c) for any other character.
*/
@safe pure string quotemeta(const char c)
{
    string[dchar] meta = [ '\n' : "<LF>",
        '\t' : "<TAB>",
        '\r' : "<CR>",
        '\0' : "<NULL>" ];
    if (c in meta)
        return meta[c];

    return [c];
}

unittest
{
    assert(quotemeta('\t')=="<TAB>");
    assert(quotemeta('\r')=="<CR>");
    assert(quotemeta('\n')=="<LF>");
    assert(quotemeta('\00')=="<NULL>");
    assert(quotemeta('t')=="t");
}

} // private


/**
Parses string $(D input), delimited by character $(D delimiter), into a tuple of variables $(D arg).

Returns:
    On success, the function returns nothing (void), and all the members of the tuple are populated.

Throws:
    $(XREF std.exception.Exception) on failure to correctly parse the string.

Example:
----
string s = "Hello World 42";
Tuple!(string,string,int) t;
parse_delimited_string(s,' ',t);
assert(t[0]=="Hello");
assert(t[1]=="World");
assert(t[2]==42);
----

Notes:
$(OL
    $(LI Parsing is much stricter (and less tolerant) than $(XREF std.format.formattedRead))
    $(LI White-space is never automatically skipped)
    $(LI A space delimiter consumes only the space character (ASCII 32, hex 0x20), not TAB (ASCII 9))
    $(LI Multiple consecutive delimiters are not consumed as one delimiter (e.g. "1\t\t\t2" is considered a string with four fields - it has three delimiters. It will throw an exception because empty fields are not allowed).)
    $(LI All fields must exist (i.e. if the tuple $(D arg) has 3 members, the $(D input) string must contain two delimiters and three valid values))
    $(LI For a string field, empty values are not acceptable, will throw an exception)
    $(LI Extra characters at the end of a field or the line will throw an exception)
)

*/
@safe void parse_delimited_string(DATA)(const string input, const char delimiter, ref DATA arg)
{
    string remaining_input = input;

    foreach (i, T; DATA.Types)
    {
        //TODO: Handle other types (for now, only numeric or strings)
        static if (isNumeric!T) {
            try {
                // consume a numeric field; parse() advances remaining_input
                // past the characters it converted.
                arg[i] = parse!T(remaining_input);
            } catch ( ConvException e ) {
                throw new Exception(text("failed to parse numeric value in field ", i+1,
                    " (text is '",quotemeta(remaining_input),"')"));
            }
        } else {
            // consume a string field
            arg[i] = consume_string_field(remaining_input,delimiter);
            if (arg[i].empty)
                throw new Exception(text("empty text at field ", i+1,
                    " (remaining text is '",quotemeta(remaining_input),"')"));
        }

        static if (i<DATA.Types.length-1) {
            //Not the last field - require more input
            if (remaining_input.empty)
                throw new Exception(text("input terminated too soon (expecting ",
                    DATA.Types.length," fields, got ", i+1, ")"));

            //Following the converted value of this field,
            //require a delimiter (to prevent extra characters, even whitespace)
            if (remaining_input[0] != delimiter)
                throw new Exception(text("extra characters in field ",i+1,
                    " (starting at '",quotemeta(remaining_input),"')"));
            consume_delimiter(remaining_input,delimiter);
        } else {
            // Last field: check for extra input
            if (!remaining_input.empty)
                throw new Exception(text("extra characters in last field ",i+1,
                    " (starting at '",quotemeta(remaining_input),"')"));
        }

    }
}

unittest
{
    Tuple!(int,string,int) a;
    parse_delimited_string("1 2 3",' ',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);

    parse_delimited_string("1\t2\t3",'\t',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);

    //Extra delimiter at the end of the line is not OK
    assertThrown!Exception(parse_delimited_string("1 2 3 ",' ',a));

    //Invalid number on first field (parse!int should fail)
    assertThrown!Exception(parse_delimited_string(".1 2 3",' ',a));

    //Extra characters in field 1 (After successful parse!int)
    assertThrown!Exception(parse_delimited_string("1. 2 3",' ',a));

    //Line contains too many fields
    assertThrown!Exception(parse_delimited_string("1 2 3 4",' ',a));

    //Line is too short
    assertThrown!Exception(parse_delimited_string("1 2",' ',a));

    //non-space/tab delimiter is fine
    parse_delimited_string("1|2|3",'|',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);
    parse_delimited_string("1| 2 |3",'|',a);
    assert(a[0]==1 && a[1]==" 2 " && a[2]==3);

    //Spaces are bad (and not ignored) if delimiter is not space (for numeric fields)
    assertThrown!Exception(parse_delimited_string("1 |2|3",'|',a));
    assertThrown!Exception(parse_delimited_string(" 1|2|3",'|',a));
    assertThrown!Exception(parse_delimited_string(" 1|2| 3",'|',a));
    assertThrown!Exception(parse_delimited_string("1|2|3 ",'|',a));

    //For string fields, empty values are not OK (different from formattedRead())
    assertThrown!Exception(parse_delimited_string("1||3",'|',a));

    //For string fields, last value can't be empty (different from formattedRead())
    Tuple!(int,string,string) b;
    assertThrown!Exception(parse_delimited_string("1|2|",'|',b));

    //One field is OK
    Tuple!(string) c;
    parse_delimited_string("foo",' ',c);
    assert(c[0]=="foo");

    //Fields that are OK for floating-point types should not work for integers (extra characters)
    Tuple!(real,int) d;
    parse_delimited_string("4.5 9",' ',d);
    assert(d[0]==4.5 && d[1]==9);
    Tuple!(int,real) e;
    assertThrown!Exception(parse_delimited_string("4.5 9",' ',e));

    //scientific notation - OK for floating-point types
    Tuple!(double,double) f;
    parse_delimited_string("-0.004e3 +4.3e10",' ',f);
    assert(f[0]==-0.004e3 && f[1]==43e9);

    //Scientific notation - fails for integers
    Tuple!(int,int) g;
    assertThrown!Exception(parse_delimited_string("-0.004e3 +4.3e10",' ',g));
}


/**
Loads a delimited text file, line-by-line, parses the line into fields, and calls a delegate/function for each line.

Returns:
    On success, the function returns nothing (void), the call back function has been called for every line.

Throws:
    $(XREF std.file.FileException) when a line cannot be parsed (the message
    includes the line number and the underlying error). Any exception thrown
    by the call back function is reported the same way.
    Opening or reading the file may also throw.

Example:
----
// Load a text file with three numeric columns,
// Store the tuple in an array
// (NOTE: this is a naive, inefficient way to populate an array, see NOTES)
alias Tuple!(int,int,int) T;
T[] t;
slurpy!( T,           // The number and types of the (expected) fields in the file
     delegate(x) { t ~= x; }, // for each line read, call this function. X will be of type T.
     '\t'             // The delimiter (default = TAB)
       )("file.txt"); // The file name to read.
----

Example:
----
// Load a text file with three numeric columns,
// Use the second column as a KEY and the third column as the VALUE.
alias Tuple!(int,int,int) T;
int[int] data;
slurpy!( T,              // The number and types of the (expected) fields in the file
     delegate(x) {       // for each line read, call this function. X will be of type T.
         data[x[1]] = x[2] ;
     },
     '\t'                // The delimiter (default = TAB)
       )("file.txt");    // The file name to read.
----

Notes:
$(OL
    $(LI See $(LREF parse_delimited_string) for details about parsing the delimited lines of the file)
    $(LI To load a whole file into an array, see $(LREF slurpy_array), which collects the rows with an Appender)
)

*/
void slurpy(MEMBERS, alias STORE_FUNCTION, char delimiter='\t')(const string filename)
{
    static assert (isTuple!MEMBERS,"slurpy: 1st template parameter must be a Tuple with the expected columns in the file");

    auto f = File(filename);
    scope(exit) f.close();
    size_t lines = 0;

    alias unaryFun!STORE_FUNCTION _Fun;
    MEMBERS data;

    foreach (origline; f.byLineFast())
    {
        ++lines;
        // byLineFast reuses its buffer, and string fields are slices of the
        // parsed input, so each line must be copied before parsing.
        string line = origline.idup;
        try {
            parse_delimited_string(line, delimiter, data);
            _Fun(data);
        } catch ( Exception e ) {
            throw new FileException(filename,text("invalid input at line ", lines,
                ": expected ", data.tupleof.length,
                " fields ",typeof(data.tupleof).stringof,
                " delimiter by '",quotemeta(delimiter),
                "' got '", origline,
                "' error details: ", e.msg ));
        }
    }
}

unittest
{
    import std.file ;
    auto deleteme = testFilename();
    write(deleteme,"1 2 3\n4 5 6\n");
    scope(exit) { assert(exists(deleteme)); remove(deleteme); }

    //Load a text file, with three fields, delimited with spaces.
    alias Tuple!(int,int,int) T;
    T[] t;
    slurpy!( T,           // The number and types of the (expected) fields in the file
         delegate(x) { t ~= x; }, // for each line read, call this function. X will be of type T.
         ' '              // The delimiter (default = TAB)
           )(deleteme);   // The file name to read.
    assert(t.length==2);
    assert(t[0] == tuple(1,2,3));
    assert(t[1] == tuple(4,5,6));

    //Any kind of invalid data should throw an exception
    //NOTE: the delegate function does nothing, because we don't care about the data
    //      in this test.
    //NOTE: see more test cases for failed parsing in the unittest of 'parse_delimited_string'.
    auto deleteme2 = testFilename() ~ ".2";
    write(deleteme2,"1 Foo 3\n4 5 6\n"); // conversion will fail in the first line
    scope(exit) { assert(exists(deleteme2)); remove(deleteme2); }
    // '(x) { }' is a no-op function literal. ('(x) => {}' would instead be a
    // lambda that returns an empty delegate.)
    assertThrown!Exception( slurpy!( T, (x) { }, ' ')(deleteme2)) ;
}


/**
Loads a delimited text file, line-by-line, parses the line into fields, returns an array of fields.

Returns:
    On success, returns an array with one element per line. With several
    column types the element type is $(D Tuple!(Types)); with a single column
    type the element type is that type itself (e.g. $(D int[])).
    For backward compatibility, a single $(D Tuple) type is accepted and is
    treated as the description of the whole row.

Throws:
    $(XREF std.file.FileException) on failure to correctly parse a line
    (see $(LREF slurpy)).

Example:
----
// Load a text file, tab-delimited, with three numeric columns.

auto data = slurpy_array!('\t', int,int,int)("file.txt");

// data[0] will be of type Tuple!(int,int,int)

// Load a text file with a single numeric column.

auto column = slurpy_array!('\t', int)("numbers.txt");

// column will be of type int[]
----
*/
Select!(Types.length == 1, Types[0][], Tuple!(Types)[])
slurpy_array(char delimiter, Types...)(string filename)
{
    alias RetT = typeof(return);

    Appender!RetT app;

    static if (Types.length == 1 && isTuple!(Types[0])) {
        // A single Tuple type already describes the full row.
        slurpy! ( Types[0], x => app.put(x) , delimiter ) (filename);
    } else static if (Types.length == 1) {
        // slurpy() requires a Tuple: wrap the single column for parsing and
        // unwrap it when storing, so the result is a plain Types[0][].
        slurpy! ( Tuple!Types, x => app.put(x[0]) , delimiter ) (filename);
    } else {
        slurpy! ( Tuple!Types, x => app.put(x) , delimiter ) (filename);
    }

    return app.data;
}

unittest
{
    import std.file ;
    auto deleteme = testFilename() ~ ".3";
    write(deleteme,"1 2 3\n4 5 6\n");
    scope(exit) { assert(exists(deleteme)); remove(deleteme); }

    //Load a text file, with three fields, delimited with spaces.
    auto t = slurpy_array!( ' ', // delimiter
                int, int, int // expected fields in the text file
                  )(deleteme);
    assert(t.length==2);
    assert(t[0] == tuple(1,2,3));
    assert(t[1] == tuple(4,5,6));

    //Load a text file with a single field per line.
    auto deleteme4 = testFilename() ~ ".4";
    write(deleteme4,"7\n8\n9\n");
    scope(exit) { assert(exists(deleteme4)); remove(deleteme4); }
    auto single = slurpy_array!( ' ', int )(deleteme4);
    static assert(is(typeof(single) == int[]));
    assert(single == [7,8,9]);
}

/// Builds a per-process, per-call-site file name for the unit tests.
version(unittest) string testFilename(string file = __FILE__, size_t line = __LINE__)
{
    import std.path;
    import std.process;
    return text("deleteme-.", thisProcessID, ".", baseName(file), ".", line);
}

/*
On Thursday, 16 May 2013 at 10:35:12 UTC, Dicebot wrote:
> Want to bring into discussion people that are not on Google+.
> Samuel recently has posted there some simple experiments with
> bioinformatics and bad performance of Phobos-based snippet has
> surprised me.
>
> I did explore issue a bit and reported results in a blog post
> (snippets are really small and simple) :
> http://dicebot.blogspot.com/2013/05/short-performance-tuning-story.html
>
> One open question remains though - can D/Phobos do better here?
> Can some changes be done to Phobos functions in question to
> improve performance or creating bioinformatics-specialized
> library is only practical solution?

I bet the problem is in readln. Currently, File.byLine() and
readln() are extremely slow, because they call fgetc() one char
at a time.

I made an "byLineFast" implementation some time ago that is 10x
faster than std.stdio.byLine. It reads lines through rawRead, and
using buffers instead of char by char.

I don't have the time to make it phobos-ready (unicode, etc.).
But I'll paste it here for any one to use (it works perfectly).

--jm
*/


import std.stdio;
import std.string: indexOf;
import core.stdc.string: memmove;
import core.stdc.stdio: EOF, fgetc, ungetc;

/**
  Reads by line in an efficient way (10 times faster than File.byLine
  from std.stdio).
  This is accomplished by reading entire buffers (fgetc() is not used
  for the line data), and allocating as little as possible.

  The char \n is considered as separator, removing the previous \r
  if it exists.

  The \n is never returned. The \r is not returned if it was
  part of a \r\n (but it is returned if it was by itself).

  The returned string is always a slice of a temporary
  buffer, that must not be stored. If necessary, you must
  use .dup or .idup to copy it to another string (slicing with
  line[] does NOT make a copy).

  Example:

        File f = File("file.txt");
        foreach (char[] line; byLineFast(f)) {
            ...process line...
            //Make a copy:
            string copy = line.idup;
        }

  The file isn't closed when done iterating, unless it was
  the only reference to the file (same as std.stdio.byLine).
  (example: byLineFast(File("file.txt"))).
*/
struct byLineFast {
    File file;
    char[] line;            // current line; null once the input is exhausted
    bool first_call = true; // true until front() has loaded the first line
    char[] buffer;          // backing storage for the data read from the file
    char[] strBuffer;       // unconsumed part of 'buffer'

    this(File f, int bufferSize=4096) {
        assert(bufferSize > 0);
        file = f;
        buffer.length = bufferSize;
    }

    @property bool empty() const {
        //It's important to check "line !is null" instead of
        //"line.length != 0", otherwise, no empty lines can
        //be returned, the iteration would be closed.
        if (line.ptr !is null) {
            return false;
        }
        // NOTE(review): the casts below remove 'const' in order to clear the
        // buffer and to peek at the file; kept as in the original so that
        // 'empty' stays callable on const instances.
        if (!file.isOpen) {
            //Clean the buffer to avoid pointer false positives:
            (cast(char[])buffer)[] = 0;
            return true;
        }

        //First read. Determine if it's empty and put the char back.
        auto mutableFP = (cast(File*) &file).getFP();
        auto c = fgetc(mutableFP);
        if (c == EOF) {
            //Clean the buffer to avoid pointer false positives:
            (cast(char[])buffer)[] = 0;
            return true;
        }
        if (ungetc(c, mutableFP) != c) {
            assert(false, "Bug in cstdlib implementation");
        }
        return false;
    }

    @property char[] front() {
        if (first_call) {
            popFront();
            first_call = false;
        }
        return line;
    }

    void popFront() {
        if (strBuffer.length == 0) {
            strBuffer = file.rawRead(buffer);
            if (strBuffer.length == 0) {
                file.detach();
                line = null;
                return;
            }
        }

        // indexOf() returns a signed index (-1 when not found); keep it signed
        // instead of storing it in an unsigned 64-bit integer.
        immutable ptrdiff_t pos = strBuffer.indexOf('\n');
        if (pos >= 0) {
            if (pos != 0 && strBuffer[pos-1] == '\r') {
                line = strBuffer[0 .. (pos-1)];
            } else {
                line = strBuffer[0 .. pos];
            }
            //Pop the line, skipping the terminator:
            strBuffer = strBuffer[(pos+1) .. $];
        } else {
            //More needs to be read here. Copy the tail of the buffer
            //to the beginning, and try to read with the empty part of
            //the buffer.
            //If no buffer was left, extend the size of the buffer before
            //reading. If the file has ended, then the line is the entire
            //buffer.

            if (strBuffer.ptr != buffer.ptr) {
                //Must use memmove because there might be overlap
                memmove(buffer.ptr, strBuffer.ptr, strBuffer.length * char.sizeof);
            }
            immutable size_t spaceBegin = strBuffer.length;
            if (strBuffer.length == buffer.length) {
                //Must extend the buffer to keep reading.
                assumeSafeAppend(buffer);
                buffer.length = buffer.length * 2;
            }
            char[] readPart = file.rawRead(buffer[spaceBegin .. $]);
            if (readPart.length == 0) {
                //End of the file. Return what's in the buffer.
                //The next popFront() will try to read again, and then
                //mark empty condition.
                if (spaceBegin != 0 && buffer[spaceBegin-1] == '\r') {
                    line = buffer[0 .. spaceBegin-1];
                } else {
                    line = buffer[0 .. spaceBegin];
                }
                strBuffer = null;
                return;
            }
            strBuffer = buffer[0 .. spaceBegin + readPart.length];
            //Now that we have new data in strBuffer, we can go on.
            //If a line isn't found, the buffer will be extended again to read more.
            popFront();
        }
    }
}