Implemented low space-complexity stream decoding for all ioformats
This commit is contained in:
parent
4c8c45ad04
commit
bf8a6454ae
@ -60,7 +60,7 @@ std::string DataFormatter::FormatBlock(
|
||||
);
|
||||
|
||||
default:
|
||||
throw std::invalid_argument("FormatBlock(): Iobase now found! Oh no. Anyway.");
|
||||
throw std::invalid_argument("FormatBlock(): Iobase not found! Oh no. Anyway.");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "CommandlineInterface.h"
|
||||
#include "DataFormatter.h"
|
||||
#include "Bases.h"
|
||||
#include <StringTools/StringTools.h>
|
||||
#include <iostream>
|
||||
#include <istream>
|
||||
#include <fstream>
|
||||
@ -9,6 +10,7 @@
|
||||
#include <cstring>
|
||||
|
||||
using namespace IO;
|
||||
using namespace Leonetienne::StringTools;
|
||||
|
||||
void DataIngestionLayer::Init() {
|
||||
|
||||
@ -97,7 +99,7 @@ void DataIngestionLayer::ReadBlock() {
|
||||
case Configuration::IOBASE_FORMAT::BASE_8:
|
||||
case Configuration::IOBASE_FORMAT::BASE_10:
|
||||
case Configuration::IOBASE_FORMAT::BASE_16:
|
||||
case Configuration::IOBASE_FORMAT::BASE_64:
|
||||
case Configuration::IOBASE_FORMAT::BASE_64: {
|
||||
// Easy case: Each digit is exactly one char in size.
|
||||
// We can just calculate how many bytes we have to read.
|
||||
|
||||
@ -143,6 +145,147 @@ void DataIngestionLayer::ReadBlock() {
|
||||
break;
|
||||
}
|
||||
|
||||
case Configuration::IOBASE_FORMAT::BASE_UWU:
|
||||
case Configuration::IOBASE_FORMAT::BASE_UGH: {
|
||||
// The whole of Italy doesn't have as much spaghetti as this is...
|
||||
|
||||
// Hard case: Each digit is n digits long. Digits may vary in length.
|
||||
// They are seperated by spaces.
|
||||
// We have to parse them...
|
||||
std::string overshoot = ""; // this is how much we've read too much in the last iteration
|
||||
|
||||
|
||||
// Gets terminated by a break statement
|
||||
while (1) {
|
||||
// We'll read chunks of 64 bytes... This should be a good
|
||||
// median, to also support small multi-byte-digit sets
|
||||
std::size_t n_bytes_read = 0;
|
||||
int lastDigitPos = -1; // Should point the the space BEFORE it. Relative to chunk.
|
||||
std::size_t digitsCollected = 0;
|
||||
std::stringstream digits;
|
||||
bool foundBlock = false;
|
||||
|
||||
// Remember to prepend our overshoot from the previous iteration this chunk
|
||||
std::string chunk = overshoot + ReadBytes(64, n_bytes_read);
|
||||
|
||||
// We should also strip all linebreaks from the chunk, that may be a result of manual stdin input.
|
||||
chunk = StringTools::Replace(chunk, '\n', "");
|
||||
|
||||
// We can't just check for completeness by n_bytes_read...
|
||||
// It can be any number of bytes, since any digit is n bytes long...
|
||||
|
||||
// Parse the 64-byte chunk string we've just fetched:
|
||||
for (std::size_t i = 0; i < chunk.size(); i++) {
|
||||
|
||||
// If we are near the end, and have still not found a complete block, let's load an additional chunk
|
||||
if (i == chunk.size() - 2) {
|
||||
const std::string nextChunk = ReadBytes(64, n_bytes_read);
|
||||
if (n_bytes_read != 0) {
|
||||
chunk += StringTools::Replace(nextChunk, '\n', "");
|
||||
}
|
||||
}
|
||||
|
||||
// If i is on a space, or at the end of the chunk,
|
||||
// and, at least one of i, or lastDigitPos is on a space,
|
||||
if (
|
||||
(
|
||||
(chunk[i] == ' ') ||
|
||||
(i == chunk.size() - 1)
|
||||
) &&
|
||||
(
|
||||
(chunk[i] == ' ') ||
|
||||
lastDigitPos >= 0 // This basically does the same as discribed, but safer, as its default value is -1.
|
||||
)
|
||||
){
|
||||
digitsCollected++;
|
||||
|
||||
// We have found a digit. Let's store it away...
|
||||
// We're putting them in a stringstream, to fit the format required by the data formatter...
|
||||
|
||||
// We have a slight edgecase if we're currently on the last char. Then we do NOT want to read one short.
|
||||
// This is because we ususally stand on a seperator char (' '), which we do not want to extract. But in that case,
|
||||
// in which we're standing on the last char, it could be not a seperator char.
|
||||
// note: chunk[i] != ' ' can only be true if we're on the last char.
|
||||
if (chunk[i] == ' ') {
|
||||
digits <<
|
||||
chunk.substr(
|
||||
lastDigitPos + 1,
|
||||
(int)i - lastDigitPos - 1
|
||||
)
|
||||
;
|
||||
} else {
|
||||
digits <<
|
||||
chunk.substr(
|
||||
lastDigitPos + 1,
|
||||
(int)i - lastDigitPos
|
||||
)
|
||||
;
|
||||
}
|
||||
|
||||
// Add a seperator, if its not the last
|
||||
if (digitsCollected != blockWidth) {
|
||||
digits << ' ';
|
||||
}
|
||||
|
||||
lastDigitPos = i;
|
||||
|
||||
// Do we have enough digits to form a block?
|
||||
if (digitsCollected == blockWidth) {
|
||||
// We've found a complete block!
|
||||
|
||||
// Trim excess nullbytes off out digit string
|
||||
const std::string digitString = std::string(
|
||||
digits.str().data(),
|
||||
strlen(digits.str().data())
|
||||
);
|
||||
|
||||
// Decode it to a block object
|
||||
const Block newBlock = DataFormatter::DecodeFormat(
|
||||
digitString,
|
||||
Configuration::formatIn
|
||||
);
|
||||
|
||||
// Enqueue it to be processed by some module
|
||||
blocks.emplace(newBlock);
|
||||
foundBlock = true;
|
||||
|
||||
// Now we have to calculate how many bytes we've read TOO MANY.
|
||||
// We have to trim this current chunk to be our new overshoot.
|
||||
|
||||
// If we still have more than a byte left, leave out the current seperator char
|
||||
if (i < chunk.size() - 1) {
|
||||
overshoot = chunk.substr(i+1); // Take all bytes from the next iterator, to the end
|
||||
}
|
||||
// Else: we are on the last char: there is no overshoot
|
||||
else {
|
||||
overshoot = "";
|
||||
}
|
||||
|
||||
// Stop the for loop
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Exit-condition:
|
||||
// We have not found any block, not even any digit.
|
||||
if ((!foundBlock) && (digitsCollected == 0)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Hard-abort: We have not finished reading a block
|
||||
if (!foundBlock) {
|
||||
throw std::runtime_error("DataIngestionLayer reached EOF whilst parsing multi-byte-digit block...");
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
throw std::invalid_argument("DataFormatter::StringToBlocks() has been passed an unknown base! No switch-case matched!");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return;
|
||||
|
@ -131,7 +131,7 @@ void DataOutputLayer::AddTrailingLinebreakIfRequired() {
|
||||
if (
|
||||
(IsFinished()) &&
|
||||
(Configuration::outputTo == Configuration::OUTPUT_TO::STDOUT) &&
|
||||
(Configuration::formatIn != Configuration::IOBASE_FORMAT::BASE_BYTES) &&
|
||||
(Configuration::formatOut != Configuration::IOBASE_FORMAT::BASE_BYTES) &&
|
||||
(!CommandlineInterface::Get().HasParam("--no-newline"))
|
||||
) {
|
||||
*out << std::endl;
|
||||
|
Loading…
x
Reference in New Issue
Block a user