2022-05-27 18:26:48 +02:00
# include "DataIngestionLayer.h"
# include "CommandlineInterface.h"
2022-05-31 21:40:13 +02:00
# include "DataFormatter.h"
2022-05-31 15:32:17 +02:00
# include "Bases.h"
2022-06-01 01:53:49 +02:00
# include <StringTools/StringTools.h>
2022-05-27 18:26:48 +02:00
# include <iostream>
2022-05-31 14:25:23 +02:00
# include <istream>
# include <fstream>
# include <sstream>
2022-05-27 18:26:48 +02:00
# include <cstring>
using namespace IO ;
2022-06-01 01:53:49 +02:00
using namespace Leonetienne : : StringTools ;
2022-05-27 18:26:48 +02:00
void DataIngestionLayer : : Init ( ) {
// Set our istream
switch ( Configuration : : inputFrom ) {
// Are we reading from stdin?
case Configuration : : INPUT_FROM : : STDIN :
// Redirect our istream to stdin
in = & std : : cin ;
break ;
// Are we reading from a file?
case Configuration : : INPUT_FROM : : FILE :
// Open the file
ifs . open (
Configuration : : inputFilename ,
std : : ios : : in | std : : ios : : binary
) ;
// A little bit of error handling
if ( ! ifs . good ( ) ) {
2022-05-31 17:06:13 +02:00
throw std : : runtime_error ( " Unable to open infilestream! " ) ;
2022-05-27 18:26:48 +02:00
}
// Redirect our istream to this infilestream
in = & ifs ;
break ;
2022-05-31 17:06:13 +02:00
// Are we reading from a parameter?
case Configuration : : INPUT_FROM : : PARAMETER :
2022-05-27 18:26:48 +02:00
// Create an instringstream with our parameter
iss = std : : istringstream (
CommandlineInterface : : Get ( ) [ " --intext " ] . GetString ( )
) ;
2022-05-31 10:44:05 +02:00
// Redirect our istream to this instringstream
2022-05-27 18:26:48 +02:00
in = & iss ;
break ;
}
2022-05-31 21:40:13 +02:00
// Derive from our the current module if we're reading ciphertext or not
2022-06-01 15:28:17 +02:00
if ( Configuration : : activeModule = = Configuration : : MODULE : : DECRYPTION ) {
2022-05-31 21:40:13 +02:00
isReadingCiphertext = true ;
}
else {
isReadingCiphertext = false ;
}
2022-05-31 14:25:23 +02:00
initialized = true ;
reachedEof = false ;
2022-06-01 03:28:31 +02:00
nBlocksRead = 0 ;
2022-05-31 14:25:23 +02:00
2022-05-31 15:32:17 +02:00
return ;
}
2022-05-27 18:26:48 +02:00
2022-05-31 15:32:17 +02:00
void DataIngestionLayer : : Destruct ( ) {
2022-05-27 18:26:48 +02:00
2022-05-31 15:32:17 +02:00
if ( Configuration : : inputFrom = = Configuration : : INPUT_FROM : : FILE ) {
ifs . close ( ) ;
}
2022-05-27 18:26:48 +02:00
return ;
}
2022-05-31 14:25:23 +02:00
// Reads input from the stream selected by Init(), decodes it, and enqueues the
// resulting Block(s) into `blocks`, incrementing nBlocksRead once per block.
//
// - Throws std::runtime_error if `initialized` is not set (Init() sets it).
// - Does nothing once `reachedEof` is set.
// - Single-char digit formats (bytes, base 2/8/10/16/64): reads at most one
//   block per call. Cleartext may be shorter than a block; ciphertext that is
//   shorter than a block throws std::runtime_error.
// - Multi-char digit formats (uwu, ugh): loops over 64-byte chunks until a pass
//   finds no digit at all, so a single call may enqueue several blocks. Throws
//   std::runtime_error if some digits, but not a whole block, were found.
// - Throws std::invalid_argument if Configuration::formatIn matches no case.
void DataIngestionLayer : : ReadBlock ( ) {
if ( ! initialized ) {
throw std : : runtime_error ( " Attempted to read on uninitialized DataIngestionLayer! " ) ;
}
// ReadBytes sets reachedEof on a short read; after that, this method is a no-op.
if ( ! reachedEof ) {
2022-05-31 21:40:13 +02:00
// A block is this many digits wide, in encoding
// (blockLengthByBase is not defined in this file; presumably it comes from Bases.h -- TODO confirm)
const std : : size_t blockWidth = blockLengthByBase [ Configuration : : formatIn ] ;
// Iterate over the string, and parse all blocks
// We now have to differentiate between single-char digit sets (like hex),
// and multi-char digit sets (like uwu):
switch ( Configuration : : formatIn ) {
case Configuration : : IOBASE_FORMAT : : BASE_BYTES :
case Configuration : : IOBASE_FORMAT : : BASE_2 :
case Configuration : : IOBASE_FORMAT : : BASE_8 :
case Configuration : : IOBASE_FORMAT : : BASE_10 :
case Configuration : : IOBASE_FORMAT : : BASE_16 :
2022-06-01 01:53:49 +02:00
case Configuration : : IOBASE_FORMAT : : BASE_64 : {
2022-05-31 21:40:13 +02:00
// Easy case: Each digit is exactly one char in size.
// We can just calculate how many bytes we have to read.
// dataRead is always blockWidth chars long: ReadBytes pads short reads with nullbytes.
std : : size_t n_bytes_read ;
const std : : string dataRead = ReadBytes ( blockWidth , n_bytes_read ) ;
// If we've read 0 bytes, this was the last block
// and it's completely empty. We can abort without doing anything.
// The ReadBytes function takes care of setting the reachedEof flag.
if ( n_bytes_read = = 0 ) {
return ;
}
// If we are reading ciphertext
// make sure we've read enough bytes to compose a block.
if (
( isReadingCiphertext ) & &
( n_bytes_read < blockWidth )
) {
throw std : : runtime_error ( " DataIngestionLayer::ReadBlock() read an input-data fragment that is smaller than a data block should be. Is your cipher text incomplete? " ) ;
}
// This should decode to a block just like this.
Block newBlock ;
// Special-case: We are reading cleartext (no ciphertext)
// cleartext is always base_bytes
if ( ! isReadingCiphertext ) {
// When just reading cleartext-bytes, we also allow shorter strings
// than BLOCK_SIZE. These will just get zero-padded.
newBlock . FromTextString ( dataRead ) ;
}
else {
// Else: recode to a block.
newBlock = DataFormatter : : DecodeFormat (
dataRead ,
Configuration : : formatIn
) ;
}
blocks . emplace ( newBlock ) ;
2022-06-01 03:28:31 +02:00
nBlocksRead + + ;
2022-05-31 21:40:13 +02:00
break ;
}
2022-05-31 15:32:17 +02:00
2022-06-01 01:53:49 +02:00
case Configuration : : IOBASE_FORMAT : : BASE_UWU :
case Configuration : : IOBASE_FORMAT : : BASE_UGH : {
// The whole of Italy doesn't have as much spaghetti as this is...
// Hard case: Each digit is n chars long. Digits may vary in length.
// They are separated by spaces.
// We have to parse them...
// Note: overshoot is a local, so it only carries leftovers between iterations
// of the while loop below, not between calls to ReadBlock().
std : : string overshoot = " " ; // this is how much we've read too much in the last iteration
// Gets terminated by a break statement
// Each iteration tries to assemble exactly one block.
while ( 1 ) {
// We'll read chunks of 64 bytes... This should be a good
// median, to also support small multi-byte-digit sets
std : : size_t n_bytes_read = 0 ;
int lastDigitPos = - 1 ; // Should point to the space BEFORE it. Relative to chunk.
std : : size_t digitsCollected = 0 ;
std : : stringstream digits ;
bool foundBlock = false ;
// Remember to prepend our overshoot from the previous iteration this chunk
std : : string chunk = overshoot + ReadBytes ( 64 , n_bytes_read ) ;
// We should also strip all linebreaks from the chunk, that may be a result of manual stdin input.
chunk = StringTools : : Replace ( chunk , ' \n ' , " " ) ;
// We can't just check for completeness by n_bytes_read...
// It can be any number of bytes, since any digit is n bytes long...
// Parse the 64-byte chunk string we've just fetched:
// chunk may grow inside this loop, so chunk.size() is re-evaluated on every iteration.
for ( std : : size_t i = 0 ; i < chunk . size ( ) ; i + + ) {
// If we are near the end, and have still not found a complete block, let's load an additional chunk
// NOTE(review): chunk.size() is unsigned, so for a size below 2 the subtraction wraps around
// and this comparison is simply never true.
if ( i = = chunk . size ( ) - 2 ) {
const std : : string nextChunk = ReadBytes ( 64 , n_bytes_read ) ;
if ( n_bytes_read ! = 0 ) {
chunk + = StringTools : : Replace ( nextChunk , ' \n ' , " " ) ;
}
}
// If i is on a space, or at the end of the chunk,
// and, at least one of i, or lastDigitPos is on a space,
if (
(
( chunk [ i ] = = ' ' ) | |
( i = = chunk . size ( ) - 1 )
) & &
(
( chunk [ i ] = = ' ' ) | |
lastDigitPos > = 0 // This basically does the same as described, but safer, as its default value is -1.
)
) {
digitsCollected + + ;
// We have found a digit. Let's store it away...
// We're putting them in a stringstream, to fit the format required by the data formatter...
// We have a slight edgecase if we're currently on the last char. Then we do NOT want to read one short.
// This is because we usually stand on a separator char (' '), which we do not want to extract. But in that case,
// in which we're standing on the last char, it could be not a separator char.
// note: chunk[i] != ' ' can only be true if we're on the last char.
if ( chunk [ i ] = = ' ' ) {
digits < <
chunk . substr (
lastDigitPos + 1 ,
( int ) i - lastDigitPos - 1
)
;
} else {
digits < <
chunk . substr (
lastDigitPos + 1 ,
( int ) i - lastDigitPos
)
;
}
// Add a separator, if it's not the last
if ( digitsCollected ! = blockWidth ) {
digits < < ' ' ;
}
lastDigitPos = i ;
// Do we have enough digits to form a block?
if ( digitsCollected = = blockWidth ) {
// We've found a complete block!
// Trim excess nullbytes off our digit string
// (ReadBytes pads short reads with nullbytes; strlen cuts at the first one)
const std : : string digitString = std : : string (
digits . str ( ) . data ( ) ,
strlen ( digits . str ( ) . data ( ) )
) ;
// Decode it to a block object
const Block newBlock = DataFormatter : : DecodeFormat (
digitString ,
Configuration : : formatIn
) ;
// Enqueue it to be processed by some module
blocks . emplace ( newBlock ) ;
2022-06-01 03:28:31 +02:00
nBlocksRead + + ;
2022-06-01 01:53:49 +02:00
foundBlock = true ;
// Now we have to calculate how many bytes we've read TOO MANY.
// We have to trim this current chunk to be our new overshoot.
// If we still have more than a byte left, leave out the current separator char
if ( i < chunk . size ( ) - 1 ) {
overshoot = chunk . substr ( i + 1 ) ; // Take all bytes from the next iterator, to the end
}
// Else: we are on the last char: there is no overshoot
else {
overshoot = " " ;
}
// Stop the for loop
break ;
}
}
}
// Exit-condition:
// We have not found any block, not even any digit.
if ( ( ! foundBlock ) & & ( digitsCollected = = 0 ) ) {
break ;
}
// Hard-abort: We have not finished reading a block
if ( ! foundBlock ) {
throw std : : runtime_error ( " DataIngestionLayer reached EOF whilst parsing multi-byte-digit block... " ) ;
}
}
break ;
}
// NOTE(review): the message below names DataFormatter::StringToBlocks(),
// although it is thrown from DataIngestionLayer::ReadBlock().
default :
throw std : : invalid_argument ( " DataFormatter::StringToBlocks() has been passed an unknown base! No switch-case matched! " ) ;
}
2022-05-31 21:40:13 +02:00
}
return ;
}
2022-05-31 14:25:23 +02:00
2022-05-31 21:40:13 +02:00
std : : string DataIngestionLayer : : ReadBytes ( const std : : size_t n , std : : size_t & out_bytes_read ) {
2022-05-31 14:25:23 +02:00
2022-05-31 21:40:13 +02:00
// Prepare a buffer to read to
char * buf = new char [ n + 1 ] ;
memset ( buf , 0 , ( n + 1 ) * sizeof ( buf [ 0 ] ) ) ;
2022-05-31 14:25:23 +02:00
2022-05-31 21:40:13 +02:00
// Read
in - > read ( buf , n * sizeof ( buf [ 0 ] ) ) ;
2022-05-31 14:25:23 +02:00
2022-05-31 21:40:13 +02:00
// Fetch how much we've read
out_bytes_read = in - > gcount ( ) ;
2022-05-31 14:25:23 +02:00
2022-05-31 21:40:13 +02:00
// Is this fewer bytes than got requested?
if ( out_bytes_read < n ) {
// Yes: EOF reached.
reachedEof = true ;
2022-05-31 14:25:23 +02:00
}
2022-05-31 21:40:13 +02:00
// Translate buffer to a standard string
const std : : string sbuf ( buf , n ) ;
delete [ ] buf ;
// Return our buffer
return sbuf ;
2022-05-31 14:25:23 +02:00
}
bool DataIngestionLayer : : ReachedEOF ( ) {
return reachedEof ;
}
bool DataIngestionLayer : : IsBlockReady ( ) {
2022-08-12 09:12:44 +02:00
// We're not ready, if we haven't reached EOF, if we should buffer
2022-05-31 15:32:17 +02:00
// the input.
if (
2022-08-12 09:12:44 +02:00
( CommandlineInterface : : Get ( ) . HasParam ( " --buffer-input " ) ) & &
2022-05-31 15:32:17 +02:00
( ! reachedEof )
) {
return false ;
}
2022-08-12 09:12:44 +02:00
// If we're not buffering, just return whether or not
2022-05-31 15:32:17 +02:00
// we have any blocks...
2022-05-31 14:25:23 +02:00
return blocks . size ( ) > 0 ;
}
2022-05-31 15:32:17 +02:00
bool DataIngestionLayer : : IsFinished ( ) {
return ( reachedEof ) & & ( blocks . size ( ) = = 0 ) ;
}
2022-05-31 14:25:23 +02:00
// Removes the oldest queued block and returns it.
//
// Throws std::runtime_error if IsBlockReady() reports that no block
// may be handed out at the moment.
Block DataIngestionLayer::GetNextBlock() {
  const bool canHandOutBlock = IsBlockReady();
  if (!canHandOutBlock) {
    throw std::runtime_error("Attempted to get the next block, but there are none left!");
  }

  // std::queue::pop() returns void, so copy the front element before removing it
  const Block oldestBlock = blocks.front();
  blocks.pop();

  return oldestBlock;
}
2022-05-27 18:26:48 +02:00
2022-06-01 03:28:31 +02:00
std : : size_t DataIngestionLayer : : NBlocksRead ( ) {
return nBlocksRead ;
}
2022-05-27 18:26:48 +02:00
std : : istream * DataIngestionLayer : : in ;
std : : ifstream DataIngestionLayer : : ifs ;
std : : istringstream DataIngestionLayer : : iss ;
2022-05-31 14:25:23 +02:00
bool DataIngestionLayer : : reachedEof = false ;
bool DataIngestionLayer : : initialized = false ;
2022-05-31 21:40:13 +02:00
bool DataIngestionLayer : : isReadingCiphertext ;
2022-06-01 03:28:31 +02:00
std : : size_t DataIngestionLayer : : nBlocksRead = 0 ;
2022-05-31 14:25:23 +02:00
std : : queue < Block > DataIngestionLayer : : blocks ;
2022-05-27 18:26:48 +02:00