Implemented low space-complexity stream decoding for all ioformats

2022-06-01 01:53:49 +02:00 · 2022-06-01 01:53:49 +02:00 · bf8a6454ae
commit bf8a6454ae
parent 4c8c45ad04
3 changed files with 146 additions and 3 deletions
--- a/GCryptCLI/src/DataFormatter.cpp
+++ b/GCryptCLI/src/DataFormatter.cpp
@ -60,7 +60,7 @@ std::string DataFormatter::FormatBlock(
      );

    default:
-      throw std::invalid_argument("FormatBlock(): Iobase now found! Oh no. Anyway.");
+      throw std::invalid_argument("FormatBlock(): Iobase not found! Oh no. Anyway.");
  }
 }

--- a/GCryptCLI/src/DataIngestionLayer.cpp
+++ b/GCryptCLI/src/DataIngestionLayer.cpp
@ -2,6 +2,7 @@
 #include "CommandlineInterface.h"
 #include "DataFormatter.h"
 #include "Bases.h"
+#include <StringTools/StringTools.h>
 #include <iostream>
 #include <istream>
 #include <fstream>
@ -9,6 +10,7 @@
 #include <cstring>

 using namespace IO;
+using namespace Leonetienne::StringTools;

 void DataIngestionLayer::Init() {

@ -97,7 +99,7 @@ void DataIngestionLayer::ReadBlock() {
      case Configuration::IOBASE_FORMAT::BASE_8:
      case Configuration::IOBASE_FORMAT::BASE_10:
      case Configuration::IOBASE_FORMAT::BASE_16:
-      case Configuration::IOBASE_FORMAT::BASE_64:
+      case Configuration::IOBASE_FORMAT::BASE_64: {
        // Easy case: Each digit is exactly one char in size.
        // We can just calculate how many bytes we have to read.

@ -143,6 +145,147 @@ void DataIngestionLayer::ReadBlock() {
        break;
      }

+      case Configuration::IOBASE_FORMAT::BASE_UWU:
+      case Configuration::IOBASE_FORMAT::BASE_UGH: {
+        // The whole of Italy doesn't have as much spaghetti as this is...
+
+        // Hard case: Each digit is n digits long. Digits may vary in length.
+        // They are seperated by spaces.
+        // We have to parse them...
+        std::string overshoot = ""; // this is how much we've read too much in the last iteration
+
+
+        // Gets terminated by a break statement
+        while (1) {
+          // We'll read chunks of 64 bytes... This should be a good
+          // median, to also support small multi-byte-digit sets
+          std::size_t n_bytes_read = 0;
+          int lastDigitPos = -1; // Should point the the space BEFORE it. Relative to chunk.
+          std::size_t digitsCollected = 0;
+          std::stringstream digits;
+          bool foundBlock = false;
+
+          // Remember to prepend our overshoot from the previous iteration this chunk
+          std::string chunk = overshoot + ReadBytes(64, n_bytes_read);
+
+          // We should also strip all linebreaks from the chunk, that may be a result of manual stdin input.
+          chunk = StringTools::Replace(chunk, '\n', "");
+
+          // We can't just check for completeness by n_bytes_read...
+          // It can be any number of bytes, since any digit is n bytes long...
+
+          // Parse the 64-byte chunk string we've just fetched:
+          for (std::size_t i = 0; i < chunk.size(); i++) {
+
+            // If we are near the end, and have still not found a complete block, let's load an additional chunk
+            if (i == chunk.size() - 2) {
+              const std::string nextChunk = ReadBytes(64, n_bytes_read);
+              if (n_bytes_read != 0) {
+                chunk += StringTools::Replace(nextChunk, '\n', "");
+              }
+            }
+
+            // If i is on a space, or at the end of the chunk,
+            // and, at least one of i, or lastDigitPos is on a space,
+            if (
+                (
+                 (chunk[i] == ' ') ||
+                 (i == chunk.size() - 1)
+                ) &&
+                (
+                 (chunk[i] == ' ') ||
+                 lastDigitPos >= 0 // This basically does the same as discribed, but safer, as its default value is -1.
+                )
+            ){
+              digitsCollected++;
+
+              // We have found a digit. Let's store it away...
+              // We're putting them in a stringstream, to fit the format required by the data formatter...
+
+              // We have a slight edgecase if we're currently on the last char. Then we do NOT want to read one short.
+              // This is because we ususally stand on a seperator char (' '), which we do not want to extract. But in that case,
+              // in which we're standing on the last char, it could be not a seperator char.
+              // note: chunk[i] != ' ' can only be true if we're on the last char.
+              if (chunk[i] == ' ') {
+                digits <<
+                  chunk.substr(
+                    lastDigitPos + 1,
+                    (int)i - lastDigitPos - 1
+                  )
+                ;
+              } else {
+                digits <<
+                  chunk.substr(
+                    lastDigitPos + 1,
+                    (int)i - lastDigitPos
+                  )
+                ;
+              }
+
+              // Add a seperator, if its not the last
+              if (digitsCollected != blockWidth) {
+                digits << ' ';
+              }
+
+              lastDigitPos = i;
+
+              // Do we have enough digits to form a block?
+              if (digitsCollected == blockWidth) {
+                // We've found a complete block!
+
+                // Trim excess nullbytes off out digit string
+                const std::string digitString = std::string(
+                  digits.str().data(),
+                  strlen(digits.str().data())
+                );
+
+                // Decode it to a block object
+                const Block newBlock = DataFormatter::DecodeFormat(
+                  digitString,
+                  Configuration::formatIn
+                );
+
+                // Enqueue it to be processed by some module
+                blocks.emplace(newBlock);
+                foundBlock = true;
+
+                // Now we have to calculate how many bytes we've read TOO MANY.
+                // We have to trim this current chunk to be our new overshoot.
+
+                // If we still have more than a byte left, leave out the current seperator char
+                if (i < chunk.size() - 1) {
+                  overshoot = chunk.substr(i+1); // Take all bytes from the next iterator, to the end
+                }
+                // Else: we are on the last char: there is no overshoot
+                else {
+                  overshoot = "";
+                }
+
+                // Stop the for loop
+                break;
+              }
+            }
+          }
+
+          // Exit-condition:
+          // We have not found any block, not even any digit.
+          if ((!foundBlock) && (digitsCollected == 0)) {
+            break;
+          }
+
+          // Hard-abort: We have not finished reading a block
+          if (!foundBlock) {
+            throw std::runtime_error("DataIngestionLayer reached EOF whilst parsing multi-byte-digit block...");
+          }
+        }
+
+        break;
+      }
+
+      default:
+        throw std::invalid_argument("DataFormatter::StringToBlocks() has been passed an unknown base! No switch-case matched!");
+      }
+
  }

  return;
--- a/GCryptCLI/src/DataOutputLayer.cpp
+++ b/GCryptCLI/src/DataOutputLayer.cpp
@ -131,7 +131,7 @@ void DataOutputLayer::AddTrailingLinebreakIfRequired() {
  if (
    (IsFinished()) &&
    (Configuration::outputTo == Configuration::OUTPUT_TO::STDOUT) &&
-    (Configuration::formatIn != Configuration::IOBASE_FORMAT::BASE_BYTES) &&
+    (Configuration::formatOut != Configuration::IOBASE_FORMAT::BASE_BYTES) &&
    (!CommandlineInterface::Get().HasParam("--no-newline"))
  ) {
    *out << std::endl;