mirror of https://github.com/Gnucash/gnucash
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
257 lines
9.7 KiB
257 lines
9.7 KiB
/********************************************************************
|
|
* test-tokenizer.cpp: test suite for the csv tokenizer class and *
|
|
* its child classes. *
|
|
* Copyright 2016 Geert Janssens <geert.gnucash@kobaltwit.be> *
|
|
* *
|
|
* This program is free software; you can redistribute it and/or *
|
|
* modify it under the terms of the GNU General Public License as *
|
|
* published by the Free Software Foundation; either version 2 of *
|
|
* the License, or (at your option) any later version. *
|
|
* *
|
|
* This program is distributed in the hope that it will be useful, *
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
* GNU General Public License for more details. *
|
|
* *
|
|
* You should have received a copy of the GNU General Public License*
|
|
* along with this program; if not, you can retrieve it from *
|
|
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.html *
|
|
* or contact: *
|
|
* *
|
|
* Free Software Foundation Voice: +1-617-542-5942 *
|
|
* 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
|
|
* Boston, MA 02110-1301, USA gnu@gnu.org *
|
|
********************************************************************/
|
|
|
|
|
|
#include <guid.hpp>
|
|
#include "../gnc-tokenizer.hpp"
|
|
#include "../gnc-tokenizer-csv.hpp"
|
|
#include "../gnc-tokenizer-fw.hpp"
|
|
#include <gtest/gtest.h>
|
|
#include <iostream>
|
|
#include <fstream> // fstream
|
|
|
|
#include <string>
|
|
#include <stdlib.h> /* getenv */
|
|
|
|
|
|
/** One test case for the csv tokenizer: a single input line together
 *  with the fields it is expected to be split into.
 *  Tables of these are walked until an entry with a NULL csv_line.
 */
struct tokenize_csv_test_data
{
    const char *csv_line;     /* input line handed to the tokenizer */
    unsigned int num_fields;  /* number of entries of fields that are in use;
                               * plain unsigned int because 'uint' is not a
                               * standard C++ type */
    const char *fields [8];   /* expected tokens, in column order */
};
|
|
|
|
/** One test case for the fixed-width tokenizer: a single input line, the
 *  column widths to cut it with and the fields expected as a result.
 *  Tables of these are walked until an entry with a NULL fw_line.
 */
struct tokenize_fw_test_data
{
    const char *fw_line;           /* input line handed to the tokenizer */
    unsigned int num_fields;       /* number of entries of field_widths and
                                    * fields that are in use; plain unsigned
                                    * int because 'uint' is not a standard
                                    * C++ type */
    unsigned int field_widths[8];  /* width of each column, in column order */
    const char *fields [8];        /* expected tokens, in column order */
};
|
|
|
|
class GncTokenizerTest : public ::testing::Test
|
|
{
|
|
public:
|
|
GncTokenizerTest()
|
|
{
|
|
fw_tok = gnc_tokenizer_factory(GncImpFileFormat::FIXED_WIDTH);
|
|
csv_tok = gnc_tokenizer_factory(GncImpFileFormat::CSV);
|
|
}
|
|
|
|
std::string get_filepath(const std::string& filename);
|
|
|
|
protected:
|
|
std::string& get_utf8_contents(std::unique_ptr<GncTokenizer> &tokenizer)
|
|
{ return tokenizer->m_utf8_contents; }
|
|
void set_utf8_contents(std::unique_ptr<GncTokenizer> &tokenizer, const std::string& newcontents)
|
|
{ tokenizer->m_utf8_contents = newcontents; }
|
|
void test_gnc_tokenize_helper (const std::string& separators, tokenize_csv_test_data* test_data); // for csv tokenizer
|
|
void test_gnc_tokenize_helper (tokenize_fw_test_data* test_data); // for csv tokenizer
|
|
|
|
std::unique_ptr<GncTokenizer> fw_tok;
|
|
std::unique_ptr<GncTokenizer> csv_tok;
|
|
};
|
|
|
|
/* Build the path of a test data file.
 * When the SRCDIR environment variable is set the result is
 * "<SRCDIR>/<filename>", otherwise filename is returned unchanged.
 */
std::string GncTokenizerTest::get_filepath(const std::string& filename)
{
    const char *source_dir = getenv("SRCDIR");
    return source_dir ? std::string(source_dir) + "/" + filename
                      : filename;
}
|
|
|
|
/* Loading a file that doesn't exist has to be reported by both
 * tokenizers as a std::ios_base::failure. */
TEST_F (GncTokenizerTest, load_file_nonexisting)
{
    auto missing_file = get_filepath ("notexist.csv");

    EXPECT_THROW (fw_tok->load_file (missing_file), std::ios_base::failure);
    EXPECT_THROW (csv_tok->load_file (missing_file), std::ios_base::failure);
}
|
|
|
|
/* Loading sample1.csv has to succeed for both tokenizers and leave
 * the file's text in their utf8 contents. */
TEST_F (GncTokenizerTest, load_file_existing)
{
    auto sample_path = get_filepath ("sample1.csv");
    const std::string expected_contents (
            "Date,Num,Description,Notes,Account,Deposit,Withdrawal,Balance\n"
            "05/01/15,45,Acme Inc.,,Miscellaneous,,\"1,100.00\",\n");

    /* Same hint for both tokenizers: the usual cause of a failure here
     * is a missing or wrong SRCDIR. */
    const std::string not_found_hint = "File " + sample_path
            + " not found. Perhaps you should set the SRCDIR environment variable to point to its containing directory ?";

    ASSERT_NO_THROW (fw_tok->load_file (sample_path)) << not_found_hint;
    ASSERT_NO_THROW (csv_tok->load_file (sample_path)) << not_found_hint;

    EXPECT_EQ(expected_contents, get_utf8_contents (fw_tok));
    EXPECT_EQ(expected_contents, get_utf8_contents (csv_tok));
}
|
|
|
|
/* Load sample1.csv with the csv tokenizer and verify the shape of the
 * result (2 rows of 8 fields) plus a plain and a quoted field. */
TEST_F (GncTokenizerTest, tokenize_from_csv_file)
{
    auto file = get_filepath ("sample1.csv");

    csv_tok->load_file (file);
    csv_tok->tokenize();
    const auto& tokens = csv_tok->get_tokens();

    /* The row count is a fatal check: the rows are indexed right below,
     * which would be out of bounds if tokenizing produced fewer of them. */
    ASSERT_EQ(2ul, tokens.size());
    EXPECT_EQ(8ul, tokens[0].size());
    EXPECT_EQ(8ul, tokens[1].size());

    /* at() keeps a row that is too short from being read out of bounds */
    EXPECT_EQ(std::string("Date"), tokens.at(0).at(0));
    EXPECT_EQ(std::string("1,100.00"), tokens.at(1).at(6));
}
|
|
|
|
/* Test parsing for several different prepared strings.
 * These tests bypass file loading and instead take a
 * prepared set of strings as input. This makes it
 * easier to add test cases without having to create a new test file
 * to load from each time.
 * Note this bypasses encoding configuration, which should be tested
 * independently.
 */
|
|
|
|
/* First test whether we're properly catching boost::tokenizer throws
|
|
* This happens when the input data has invalid escape sequences */
|
|
TEST_F (GncTokenizerTest, tokenize_binary_data)
|
|
{
|
|
GncCsvTokenizer *csvtok = dynamic_cast<GncCsvTokenizer*>(csv_tok.get());
|
|
csvtok->set_separators (",");
|
|
|
|
set_utf8_contents (csv_tok, R"(\764Test,Something)");
|
|
EXPECT_THROW (csv_tok->tokenize(), std::range_error);
|
|
}
|
|
|
|
/* This helper function will run the parse step on the given data
|
|
* with the parser as configured by the calling test function.
|
|
* This allows the same code to be used with different csv test strings
|
|
* and parser option combinations.
|
|
*/
|
|
void
|
|
GncTokenizerTest::test_gnc_tokenize_helper (const std::string& separators, tokenize_csv_test_data* test_data)
|
|
{
|
|
|
|
GncCsvTokenizer *csvtok = dynamic_cast<GncCsvTokenizer*>(csv_tok.get());
|
|
csvtok->set_separators (separators);
|
|
|
|
int i = 0;
|
|
while (test_data[i].csv_line)
|
|
{
|
|
|
|
tokenize_csv_test_data cur_line = test_data[i];
|
|
set_utf8_contents (csv_tok, std::string(cur_line.csv_line));
|
|
csv_tok->tokenize();
|
|
|
|
// The tests only come with one line, so get the first row only
|
|
auto line_tok = csv_tok->get_tokens().front();
|
|
EXPECT_EQ (cur_line.num_fields, line_tok.size());
|
|
for (auto j = 0ul; j < cur_line.num_fields; j++)
|
|
{
|
|
EXPECT_EQ (std::string (cur_line.fields[j]), line_tok[j]);
|
|
}
|
|
|
|
i++;
|
|
}
|
|
}
|
|
|
|
/* Test lines for the csv tokenizer with ',' as separator:
 * a header line, a full data line with a quoted field containing the
 * separator, and a shorter data line. The last entry is the sentinel
 * that ends the table.
 */
static tokenize_csv_test_data comma_separated [] = {
    {
        "Date,Num,Description,Notes,Account,Deposit,Withdrawal,Balance",
        8,
        { "Date", "Num", "Description", "Notes", "Account", "Deposit", "Withdrawal", "Balance" }
    },
    {
        "05/01/15,45,Acme Inc.,,Miscellaneous,,\"1,100.00\",",
        8,
        { "05/01/15", "45", "Acme Inc.", "", "Miscellaneous", "", "1,100.00", "" }
    },
    {
        "05/01/15,45,Acme Inc.,,Miscellaneous,",
        6,
        { "05/01/15", "45", "Acme Inc.", "", "Miscellaneous", "", nullptr, nullptr }
    },
    {
        nullptr,
        0,
        { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }
    },
};
|
|
|
|
/* Run the comma_separated table through the csv tokenizer with ","
 * set as separators. */
TEST_F (GncTokenizerTest, tokenize_comma_sep)
{
    tokenize_csv_test_data *cases = comma_separated;
    test_gnc_tokenize_helper (",", cases);
}
|
|
|
|
/* Test lines for the csv tokenizer with ';' as separator:
 * a header line, a full data line with a quoted field containing a
 * comma, and a shorter data line. The last entry is the sentinel
 * that ends the table.
 */
static tokenize_csv_test_data semicolon_separated [] = {
    {
        "Date;Num;Description;Notes;Account;Deposit;Withdrawal;Balance",
        8,
        { "Date", "Num", "Description", "Notes", "Account", "Deposit", "Withdrawal", "Balance" }
    },
    {
        "05/01/15;45;Acme Inc.;;Miscellaneous;;\"1,100.00\";",
        8,
        { "05/01/15", "45", "Acme Inc.", "", "Miscellaneous", "", "1,100.00", "" }
    },
    {
        "05/01/15;45;Acme Inc.;;Miscellaneous;",
        6,
        { "05/01/15", "45", "Acme Inc.", "", "Miscellaneous", "", nullptr, nullptr }
    },
    {
        nullptr,
        0,
        { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }
    },
};
|
|
/* Run the semicolon_separated table through the csv tokenizer with ";"
 * set as separators. */
TEST_F (GncTokenizerTest, tokenize_semicolon_sep)
{
    tokenize_csv_test_data *cases = semicolon_separated;
    test_gnc_tokenize_helper (";", cases);
}
|
|
|
|
|
|
|
|
void
|
|
GncTokenizerTest::test_gnc_tokenize_helper (tokenize_fw_test_data* test_data)
|
|
{
|
|
|
|
GncFwTokenizer *fwtok = dynamic_cast<GncFwTokenizer*>(fw_tok.get());
|
|
|
|
int i = 0;
|
|
while (test_data[i].fw_line)
|
|
{
|
|
tokenize_fw_test_data cur_line = test_data[i];
|
|
set_utf8_contents (fw_tok, std::string(cur_line.fw_line));
|
|
auto columns = std::vector<uint>();
|
|
for (auto j = 0ul; j < cur_line.num_fields; j++)
|
|
columns.push_back (cur_line.field_widths[j]);
|
|
fwtok->columns (columns);
|
|
fw_tok->tokenize();
|
|
|
|
// The tests only come with one line, so get the first row only
|
|
auto line_tok = fw_tok->get_tokens().front();
|
|
EXPECT_EQ (cur_line.num_fields, line_tok.size());
|
|
for (auto j = 0ul; j < cur_line.num_fields; j++)
|
|
{
|
|
EXPECT_EQ (std::string (cur_line.fields[j]), line_tok[j]);
|
|
}
|
|
|
|
i++;
|
|
}
|
|
}
|
|
|
|
/* Test lines for the fixed-width tokenizer, closed by a sentinel entry.
 * Each input line is written as one string literal per column, so the
 * padding that makes a column as wide as its entry in the widths list
 * is visible: the length of every literal equals the matching width.
 * The expected fields are the column texts without that padding.
 */
static tokenize_fw_test_data fixed_width [] = {
    /* header line, first column padded to 8 characters */
    { "Date    " "Num" "Description" "Account" "Deposit", 5,
      { 8,3,11,7,7,0,0,0 },
      { "Date","Num","Description","Account","Deposit",NULL,NULL,NULL } },
    /* data line in which every column is completely filled */
    { "05/01/15" "45" "Acme Inc." "Miscellaneous" "1,100.00", 5,
      { 8,2,9,13,8,0,0,0 },
      { "05/01/15","45","Acme Inc.","Miscellaneous","1,100.00",NULL,NULL,NULL } },
    /* data line with padded columns and one column holding only blanks */
    { "05/01/15  " "45  " "Acme Inc." "Miscellaneous" "        " "1,100.00", 6,
      { 10,4,9,13,8,8,0,0 },
      { "05/01/15","45","Acme Inc.","Miscellaneous","","1,100.00",NULL,NULL } },
    /* sentinel */
    { NULL, 0,
      { 0,0,0,0,0,0,0,0 },
      { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL } },
};
|
|
|
|
/* Run the fixed_width table through the fixed-width tokenizer */
TEST_F (GncTokenizerTest, tokenize_fw)
{
    tokenize_fw_test_data *cases = fixed_width;
    test_gnc_tokenize_helper (cases);
}
|