package dsv

import "git.sr.ht/~shulhan/pakakeh.go/lib/dsv"

Package dsv is a library for working with delimited separated value (DSV).

DSV is a free-style form of the Comma Separated Value (CSV) text format, where each row is separated by a newline, and each column can be separated by any string and enclosed by a left-quote and a right-quote.
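For example (a hypothetical input, not taken from the package documentation), a single row could look like,

	'jakarta';'10.5';'indonesia'

where every column is enclosed by a single quote as its left-quote and right-quote and ended by a semicolon as its separator; both the quotes and the separator are declared per column in the input metadata (see the Metadata type below).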

Constants

const (
	// DefaultRejected define the default file which will contain the
	// rejected row.
	DefaultRejected = "rejected.dat"

	// DefaultMaxRows define default maximum row that will be saved
	// in memory for each read if input data is too large and can not be
	// consumed in one read operation.
	DefaultMaxRows = 256

	// DefDatasetMode default output mode is rows.
	DefDatasetMode = DatasetModeROWS

	// DefEOL default end-of-line
	DefEOL = '\n'
)
const (
	// DatasetModeROWS is a string representation of output mode rows.
	DatasetModeROWS = "ROWS"
	// DatasetModeCOLUMNS is a string representation of output mode columns.
	DatasetModeCOLUMNS = "COLUMNS"
	// DatasetModeMATRIX will save data in rows and columns. This mode will
	// consume more memory than "rows" and "columns" but give greater
	// flexibility when working with data.
	DatasetModeMATRIX = "MATRIX"
)
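As a minimal sketch of how these mode constants can be used with a reader, assuming a hypothetical configuration file "config.dsv" and assuming that a tabula.Dataset value (from the companion tabula package) is accepted as the dataset argument,

	// Sketch only. Assumes imports: fmt, log,
	// "git.sr.ht/~shulhan/pakakeh.go/lib/dsv" and
	// "git.sr.ht/~shulhan/pakakeh.go/lib/tabula" (assumed import path).
	reader, e := dsv.NewReader("config.dsv", &tabula.Dataset{})
	if e != nil {
		log.Fatal(e)
	}
	defer reader.Close()

	// Store the parsed records per column instead of per row.
	reader.SetDatasetMode(dsv.DatasetModeCOLUMNS)
	fmt.Println(reader.GetDatasetMode()) // "COLUMNS"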
const (

	// EReadMissLeftQuote read error when no left-quote found on line.
	EReadMissLeftQuote
	// EReadMissRightQuote read error when no right-quote found on line.
	EReadMissRightQuote
	// EReadMissSeparator read error when no separator found on line.
	EReadMissSeparator
	// EReadLine error when reading line from file.
	EReadLine
	// EReadEOF error which indicates end-of-file.
	EReadEOF
	// ETypeConversion error when converting type from string to numeric or
	// vice versa.
	ETypeConversion
)
const (
	// DefSeparator default separator that will be used if it is not given
	// in config file.
	DefSeparator = ","
	// DefOutput file.
	DefOutput = "output.dat"
	// DefEscape default string to escape the right quote or separator.
	DefEscape = "\\"
)

Variables

var (
	// ErrNoInput define an error when no Input file is given to Reader.
	ErrNoInput = errors.New("dsv: No input file is given in config")

	// ErrMissRecordsLen define an error when trying to push Row
	// to Field, when their length is not equal.
	// See reader.PushRowToColumns().
	ErrMissRecordsLen = errors.New("dsv: Mismatch between number of record in row and columns length")

	// ErrNoOutput define an error when no output file is given to Writer.
	ErrNoOutput = errors.New("dsv: No output file is given in config")

	// ErrNotOpen define an error when output file has not been opened
	// by Writer.
	ErrNotOpen = errors.New("dsv: Output file is not opened")

	// ErrNilReader define an error when Reader object is nil when passed
	// to Write function.
	ErrNilReader = errors.New("dsv: Reader object is nil")
)

Functions

func ConfigCheckPath

func ConfigCheckPath(comin ConfigInterface, file string) string

ConfigCheckPath returns the config path of `comin` joined with `file` if `file` contains no path, otherwise it returns `file` unchanged.

func ConfigOpen

func ConfigOpen(rw any, fcfg string) error

ConfigOpen opens the configuration file `fcfg` and initializes the attributes of `rw`.

func ConfigParse

func ConfigParse(rw any, cfg []byte) error

ConfigParse from JSON string.
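A minimal sketch of ConfigParse that fills a Reader from a JSON configuration held in memory; the field values below are hypothetical,

	// Assumes imports: log, "git.sr.ht/~shulhan/pakakeh.go/lib/dsv".
	cfg := []byte(`{
		"Input": "input.dat",
		"InputMetadata": [
			{"Name": "name", "Type": "string", "Separator": ","},
			{"Name": "age", "Type": "integer"}
		]
	}`)

	reader := &dsv.Reader{}
	if e := dsv.ConfigParse(reader, cfg); e != nil {
		log.Fatal(e)
	}
	// ConfigParse only fills the fields; opening the input is a separate
	// step (see Init and Open).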

func InitWriter

func InitWriter(writer WriterInterface) error

InitWriter initialize writer by opening output file.

func OpenWriter

func OpenWriter(writer WriterInterface, fcfg string) (e error)

OpenWriter opens the writer configuration file `fcfg` and initializes the writer attributes.

func Read

func Read(reader ReaderInterface) (n int, e error)

Read row from input file.

func SimpleWrite

func SimpleWrite(reader ReaderInterface, fcfg string) (nrows int, e error)

SimpleWrite provide a shortcut to write data from reader using output metadata format and output file defined in file `fcfg`.

Types

type Config

type Config struct {
	// ConfigPath path to configuration file.
	ConfigPath string
}

Config for working with DSV configuration.

func (*Config) GetConfigPath

func (cfg *Config) GetConfigPath() string

GetConfigPath return the base path of configuration file.

func (*Config) SetConfigPath

func (cfg *Config) SetConfigPath(dir string)

SetConfigPath for reading input and writing rejected file.

type ConfigInterface

type ConfigInterface interface {
	GetConfigPath() string
	SetConfigPath(dir string)
}

ConfigInterface for reader and writer for initializing the config from JSON.

type Metadata

type Metadata struct {
	// Name of the column, optional.
	Name string `json:"Name"`

	// Type of the column, default to "string".
	// Valid value are: "string", "integer", "real"
	Type string `json:"Type"`

	// Separator for column in record.
	Separator string `json:"Separator"`

	// LeftQuote define the characters that enclosed the column in the left
	// side.
	LeftQuote string `json:"LeftQuote"`

	// RightQuote define the characters that enclosed the column in the
	// right side.
	RightQuote string `json:"RightQuote"`

	// ValueSpace contain the possible value in records
	ValueSpace []string `json:"ValueSpace"`

	// T type of column in integer.
	T int `json:"T"`

	// Skip, if it is true, this column will be ignored and not saved in
	// the reader object. Default to false.
	Skip bool `json:"Skip"`
}

Metadata represents how to parse each column in a record.

func NewMetadata

func NewMetadata(name, tipe, sep, leftq, rightq string, vs []string) (
	md *Metadata,
)

NewMetadata create and return new metadata.
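A small sketch of building metadata programmatically and attaching it to a reader; the column definition here is hypothetical,

	// Assumes reader is a *dsv.Reader, for example one returned by
	// NewReader, and assumes import "git.sr.ht/~shulhan/pakakeh.go/lib/dsv".
	// A string column enclosed in double quotes and ended by a comma,
	// with no restriction on its value space (nil).
	md := dsv.NewMetadata("city", "string", ",", `"`, `"`, nil)
	reader.AddInputMetadata(md)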

func (*Metadata) GetLeftQuote

func (md *Metadata) GetLeftQuote() string

GetLeftQuote return the string used in the beginning of record value.

func (*Metadata) GetName

func (md *Metadata) GetName() string

GetName return the name of metadata.

func (*Metadata) GetRightQuote

func (md *Metadata) GetRightQuote() string

GetRightQuote return the string used at the end of the record value.

func (*Metadata) GetSeparator

func (md *Metadata) GetSeparator() string

GetSeparator return the field separator.

func (*Metadata) GetSkip

func (md *Metadata) GetSkip() bool

GetSkip return true if the column will be skipped when reading data, false otherwise.

func (*Metadata) GetType

func (md *Metadata) GetType() int

GetType return type of metadata.

func (*Metadata) GetTypeName

func (md *Metadata) GetTypeName() string

GetTypeName return string representation of type.

func (*Metadata) GetValueSpace

func (md *Metadata) GetValueSpace() []string

GetValueSpace return value space.

func (*Metadata) Init

func (md *Metadata) Init()

Init initialize metadata column, i.e. check and set column type.

If type is unknown it will default to string.

func (*Metadata) IsEqual

func (md *Metadata) IsEqual(o MetadataInterface) bool

IsEqual return true if this metadata is equal with the other instance, false otherwise.

func (*Metadata) String

func (md *Metadata) String() string

String yes, it will print it in JSON-like format.

type MetadataInterface

type MetadataInterface interface {
	Init()
	GetName() string
	GetType() int
	GetTypeName() string
	GetLeftQuote() string
	GetRightQuote() string
	GetSeparator() string
	GetSkip() bool
	GetValueSpace() []string

	IsEqual(MetadataInterface) bool
}

MetadataInterface is the interface for field metadata. This allows anyone to extend the DSV library, including the metadata.

func FindMetadata

func FindMetadata(mdin MetadataInterface, mds []MetadataInterface) (
	idx int,
	mdout MetadataInterface,
)

FindMetadata, given a slice of metadata, finds the metadata in the slice that has the same name as `mdin`, ignoring metadata whose Skip value is true. If found, it returns the index and the matched metadata object; if not found, it returns -1 as the index and nil as `mdout`.
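A sketch of looking up one metadata entry by name inside a reader's input metadata; the column name is hypothetical,

	// Assumes reader is an initialized dsv.ReaderInterface and assumes
	// import "git.sr.ht/~shulhan/pakakeh.go/lib/dsv".
	target := dsv.NewMetadata("city", "string", "", "", "", nil)

	idx, md := dsv.FindMetadata(target, reader.GetInputMetadata())
	if idx < 0 {
		// No metadata named "city", or it is marked with Skip.
	} else {
		_ = md // the matched metadata, located at index idx.
	}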

type ReadWriter

type ReadWriter struct {
	Writer
	Reader
}

ReadWriter combine reader and writer.

func New

func New(config string, dataset any) (rw *ReadWriter, e error)

New create a new ReadWriter object.
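A minimal sketch of creating a ReadWriter; the configuration file name is hypothetical, and using &tabula.Dataset{} as the dataset is an assumption about an acceptable value,

	// Assumes imports: log, "git.sr.ht/~shulhan/pakakeh.go/lib/dsv",
	// "git.sr.ht/~shulhan/pakakeh.go/lib/tabula" (assumed import path).
	rw, e := dsv.New("config.dsv", &tabula.Dataset{})
	if e != nil {
		log.Fatal(e)
	}
	defer rw.Close()

	// The embedded Reader and Writer are usable directly, for example
	// dsv.Read(&rw.Reader) to read and rw.Write(&rw.Reader) to write.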

func (*ReadWriter) Close

func (dsv *ReadWriter) Close() (e error)

Close reader and writer.

func (*ReadWriter) SetConfigPath

func (dsv *ReadWriter) SetConfigPath(dir string)

SetConfigPath of input and output file.

type Reader

type Reader struct {

	// Input file, mandatory.
	Input string `json:"Input"`

	// Rejected is the file name where row that does not fit
	// with metadata will be saved.
	Rejected string `json:"Rejected"`

	// DatasetMode define how you want the result to be saved. There are
	// three options: either in "rows", "columns", or "matrix" mode.
	// For example, input data file,
	//
	//	a,b,c
	//	1,2,3
	//
	// "rows" mode is where each line saved in its own slice, resulting
	// in Rows:
	//
	//	[a b c]
	//	[1 2 3]
	//
	// "columns" mode is where each line saved by columns, resulting in
	// Columns:
	//
	//	[a 1]
	//	[b 2]
	//	[c 3]
	//
	// "matrix" mode is where each record saved in their own row and column.
	//
	DatasetMode string `json:"DatasetMode"`

	// Config define path of configuration file.
	//
	// If the configuration is located in another directory, e.g.
	// "../../config.dsv", and the Input option is set with a name only, like
	// "input.dat", we assume that it is in the same directory where the
	// configuration file belongs.
	Config

	// InputMetadata define format for each column in input data.
	InputMetadata []Metadata `json:"InputMetadata"`

	// Skip n lines from the head.
	Skip int `json:"Skip"`

	// MaxRows define maximum row that this reader will read and
	// saved in the memory at one read operation.
	// If the value is -1, all rows will be read.
	MaxRows int `json:"MaxRows"`

	// TrimSpace or not. If it is true, the white space at the beginning
	// and end of each input line will be removed before parsing,
	// otherwise the line is left unmodified. Default is true.
	TrimSpace bool `json:"TrimSpace"`
	// contains filtered or unexported fields
}

Reader holds all configuration, metadata, and input data.

The DSV Reader works like this,

(1) Initialize a new dsv reader object,

	dsvReader, e := dsv.NewReader(configfile, dataset)

(2) Do not forget to check for error ...

	if e != nil {
		// handle error
	}

(3) Make sure to close all files after finished,

	defer dsvReader.Close()

(4) Create a loop to read the input data,

	for {
		n, e := dsv.Read(dsvReader)

		if e == io.EOF {
			break
		}

(4.1) Iterate through the rows,

		for row := range dsvReader.GetDataAsRows() {
			// work with row ...
		}
	}

That's it.
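Putting the steps above into a complete, self-contained sketch; the configuration file name is hypothetical, and the tabula import path and the use of &tabula.Dataset{} as the dataset are assumptions,

	package main

	import (
		"fmt"
		"io"
		"log"

		"git.sr.ht/~shulhan/pakakeh.go/lib/dsv"
		// Assumption: the companion tabula package lives at this path and
		// tabula.Dataset is an acceptable dataset implementation.
		"git.sr.ht/~shulhan/pakakeh.go/lib/tabula"
	)

	func main() {
		// "config.dsv" is a hypothetical configuration file.
		dataset := &tabula.Dataset{}

		dsvReader, e := dsv.NewReader("config.dsv", dataset)
		if e != nil {
			log.Fatal(e)
		}
		defer dsvReader.Close()

		for {
			n, e := dsv.Read(dsvReader)

			if n > 0 {
				fmt.Println("rows read in this batch:", n)
				// work with the rows stored in dataset here ...
			}
			if e == io.EOF {
				break
			}
			if e != nil {
				log.Fatal(e)
			}
		}
	}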

func NewReader

func NewReader(config string, dataset any) (reader *Reader, e error)

NewReader create and initialize new instance of DSV Reader with default values.

func (*Reader) AddInputMetadata

func (reader *Reader) AddInputMetadata(md *Metadata)

AddInputMetadata add new input metadata to reader.

func (*Reader) AppendMetadata

func (reader *Reader) AppendMetadata(mdi MetadataInterface)

AppendMetadata will append new metadata `mdi` to the list of reader input metadata.

func (*Reader) Close

func (reader *Reader) Close() (e error)

Close all open descriptors.

func (*Reader) CopyConfig

func (reader *Reader) CopyConfig(src *Reader)

CopyConfig copies configuration from another reader object, not including data and metadata.

func (*Reader) FetchNextLine

func (reader *Reader) FetchNextLine(lastline []byte) (line []byte, e error)

FetchNextLine read the next line and combine it with the `lastline`.

func (*Reader) Flush

func (reader *Reader) Flush() error

Flush all output buffer.

func (*Reader) GetDataset

func (reader *Reader) GetDataset() any

GetDataset return reader dataset.

func (*Reader) GetDatasetMode

func (reader *Reader) GetDatasetMode() string

GetDatasetMode return output mode of data.

func (*Reader) GetInput

func (reader *Reader) GetInput() string

GetInput return the input file.

func (*Reader) GetInputMetadata

func (reader *Reader) GetInputMetadata() []MetadataInterface

GetInputMetadata return the input metadata as a slice of MetadataInterface.

func (*Reader) GetInputMetadataAt

func (reader *Reader) GetInputMetadataAt(idx int) MetadataInterface

GetInputMetadataAt return pointer to metadata at index 'idx'.

func (*Reader) GetMaxRows

func (reader *Reader) GetMaxRows() int

GetMaxRows return number of maximum rows for reading.

func (*Reader) GetNColumnIn

func (reader *Reader) GetNColumnIn() int

GetNColumnIn return number of input columns, or number of metadata, including column with Skip=true.

func (*Reader) GetRejected

func (reader *Reader) GetRejected() string

GetRejected return name of rejected file.

func (*Reader) GetSkip

func (reader *Reader) GetSkip() int

GetSkip return number of lines that will be skipped.

func (*Reader) Init

func (reader *Reader) Init(fcfg string, dataset any) (e error)

Init will initialize reader object by

(1) Check if dataset is not empty.
(2) Read config file.
(3) Set reader object default value.
(4) Check if output mode is valid and initialize it if valid.
(5) Check and initialize metadata and columns attributes.
(6) Check if Input is name only without path, so we can prefix it with config path.
(7) Open rejected file.
(8) Open input file.

func (*Reader) IsEqual

func (reader *Reader) IsEqual(other *Reader) bool

IsEqual compare only the configuration and metadata with other instance.

func (*Reader) IsTrimSpace

func (reader *Reader) IsTrimSpace() bool

IsTrimSpace return value of TrimSpace option.

func (*Reader) MergeColumns

func (reader *Reader) MergeColumns(other ReaderInterface)

MergeColumns append metadata and columns from another reader if not exist in current metadata set.

func (*Reader) MergeRows

func (reader *Reader) MergeRows(other *Reader)

MergeRows append rows from another reader.

func (*Reader) Open

func (reader *Reader) Open() (e error)

Open input and rejected file.

func (*Reader) OpenInput

func (reader *Reader) OpenInput() (e error)

OpenInput open the input file; metadata must have been initialized.

func (*Reader) OpenRejected

func (reader *Reader) OpenRejected() (e error)

OpenRejected open the rejected file, for saving unparseable lines.

func (*Reader) ReadLine

func (reader *Reader) ReadLine() (line []byte, e error)

ReadLine will read one line from input file.

func (*Reader) Reject

func (reader *Reader) Reject(line []byte) (int, error)

Reject the line and save it to the reject file.

func (*Reader) Reset

func (reader *Reader) Reset() (e error)

Reset all variables for next read operation. Number of rows will be 0, and Rows will be empty again.

func (*Reader) SetDatasetMode

func (reader *Reader) SetDatasetMode(mode string)

SetDatasetMode to `mode`.

func (*Reader) SetDefault

func (reader *Reader) SetDefault()

SetDefault options for global config and each metadata.

func (*Reader) SetInput

func (reader *Reader) SetInput(path string)

SetInput file.

func (*Reader) SetMaxRows

func (reader *Reader) SetMaxRows(max int)

SetMaxRows will set maximum rows that will be read from input file.

func (*Reader) SetRejected

func (reader *Reader) SetRejected(path string)

SetRejected file.

func (*Reader) SetSkip

func (reader *Reader) SetSkip(n int)

SetSkip set number of lines that will be skipped before reading actual data.

func (*Reader) SkipLines

func (reader *Reader) SkipLines() (e error)

SkipLines skip parsing n lines from input file. The n is defined in the attribute "Skip".

type ReaderError

type ReaderError struct {
	// Func where error happened
	Func string

	// What cause the error?
	What string

	// Line define the line which cause error
	Line string

	// T define type of error.
	T int

	// Pos character position which cause error
	Pos int

	// N line number
	N int
}

ReaderError to handle error data and message.

func ParseLine

func ParseLine(reader ReaderInterface, line []byte) (
	prow *tabula.Row, eRead *ReaderError,
)

ParseLine parse a line containing records. The output is an array of records (a single row).

This is how the algorithm works,

(1) Create n slices of records, where n is the number of column metadata.
(2) For each metadata,
(2.0) Check if the next sequence matches the separator.
(2.0.1) If it matches, create an empty record.
(2.1) If using a left quote, skip until the left-quote is found.
(2.2) If using a right quote, append bytes to the buffer until the right-quote is found.
(2.2.1) If using a separator, skip until the separator.
(2.3) If using a separator, append bytes to the buffer until the separator.
(2.4) Otherwise append all bytes to the buffer.
(3) Save the buffer to the record.
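A sketch of parsing one raw line through an already configured reader; the line content is hypothetical and must match the reader's input metadata,

	// Assumes reader is an initialized dsv.ReaderInterface (for example a
	// *dsv.Reader returned by NewReader) and assumes import "log".
	line := []byte(`"alice","30"`)

	row, rerr := dsv.ParseLine(reader, line)
	if rerr != nil {
		// rerr.T is one of the ERead* constants, e.g. EReadMissLeftQuote.
		log.Fatal(rerr)
	}
	_ = row // a *tabula.Row containing the parsed records.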

func ReadRow

func ReadRow(reader ReaderInterface, linenum int) (
	row *tabula.Row,
	line []byte,
	n int,
	eRead *ReaderError,
)

ReadRow read one line at a time until we get one row or error when parsing the data.

func (*ReaderError) Error

func (e *ReaderError) Error() string

Error to string.

type ReaderInterface

type ReaderInterface interface {
	ConfigInterface
	AddInputMetadata(*Metadata)
	AppendMetadata(MetadataInterface)
	GetInputMetadata() []MetadataInterface
	GetInputMetadataAt(idx int) MetadataInterface
	GetMaxRows() int
	SetMaxRows(max int)
	GetDatasetMode() string
	SetDatasetMode(mode string)
	GetNColumnIn() int
	GetInput() string
	SetInput(path string)
	GetRejected() string
	SetRejected(path string)
	GetSkip() int
	SetSkip(n int)
	IsTrimSpace() bool
	SetDefault()
	OpenInput() error
	OpenRejected() error
	SkipLines() error

	Reset() error
	Flush() error
	ReadLine() ([]byte, error)
	FetchNextLine([]byte) ([]byte, error)
	Reject(line []byte) (int, error)
	Close() error

	GetDataset() any
	MergeColumns(ReaderInterface)
}

ReaderInterface is the interface for reading DSV file.

func SimpleMerge

func SimpleMerge(fin1, fin2 string, dataset1, dataset2 any) (
	ReaderInterface,
	error,
)

SimpleMerge provide a shortcut to merge two dsv files using configuration files passed in parameters.

One must remember to set, in both input configurations,

- "MaxRows" to -1 to be able to read all rows, and
- "DatasetMode" to "columns" to speed up the process.

This function returns the merged reader, or an error if it failed.
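A sketch of SimpleMerge; both configuration file names and the output configuration are hypothetical, and using &tabula.Dataset{} for the dataset arguments is an assumption,

	// Assumes imports: log, "git.sr.ht/~shulhan/pakakeh.go/lib/dsv",
	// "git.sr.ht/~shulhan/pakakeh.go/lib/tabula" (assumed import path).
	// Both "left.dsv" and "right.dsv" should set MaxRows to -1 and
	// DatasetMode to "columns", as noted above.
	merged, e := dsv.SimpleMerge("left.dsv", "right.dsv",
		&tabula.Dataset{}, &tabula.Dataset{})
	if e != nil {
		log.Fatal(e)
	}

	// The merged reader can then be written out, for example with
	// SimpleWrite and an output configuration file.
	if _, e = dsv.SimpleWrite(merged, "out.dsv"); e != nil {
		log.Fatal(e)
	}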

func SimpleRead

func SimpleRead(fcfg string, dataset any) (
	reader ReaderInterface,
	e error,
)

SimpleRead provide a shortcut to read data from file using configuration file from `fcfg`. Return the reader containing the data, or an error if it failed. The reader object has been closed upon return, so if one needs to read all the data, simply set `MaxRows` to `-1` in the config file.
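A sketch of SimpleRead with a hypothetical configuration file; using &tabula.Dataset{} as the dataset is an assumption, and MaxRows should be -1 in the configuration if all rows are wanted,

	// Assumes imports: log, "git.sr.ht/~shulhan/pakakeh.go/lib/dsv",
	// "git.sr.ht/~shulhan/pakakeh.go/lib/tabula" (assumed import path).
	dataset := &tabula.Dataset{}

	reader, e := dsv.SimpleRead("config.dsv", dataset)
	if e != nil {
		log.Fatal(e)
	}
	// The reader is already closed at this point; the rows live in
	// dataset and are also reachable through reader.GetDataset().
	_ = reader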

type Writer

type Writer struct {

	// BufWriter for buffered writer.
	BufWriter *bufio.Writer `json:"-"`

	Config `json:"-"`

	// Output file where the records will be written.
	Output string `json:"Output"`

	// OutputMetadata define format for each column.
	OutputMetadata []Metadata `json:"OutputMetadata"`
	// contains filtered or unexported fields
}

Writer write records from reader or slice using format configuration in metadata.

func NewWriter

func NewWriter(config string) (writer *Writer, e error)

NewWriter create a writer object. User must call Open after that to populate the output and metadata.

func (*Writer) AddMetadata

func (writer *Writer) AddMetadata(md Metadata)

AddMetadata will add new output metadata to writer.

func (*Writer) Close

func (writer *Writer) Close() (e error)

Close all open descriptors.

func (*Writer) Flush

func (writer *Writer) Flush() error

Flush output buffer to disk.

func (*Writer) GetOutput

func (writer *Writer) GetOutput() string

GetOutput return output filename.

func (*Writer) OpenOutput

func (writer *Writer) OpenOutput(file string) (e error)

OpenOutput file and buffered writer. File will be truncated if it exists.

func (*Writer) ReopenOutput

func (writer *Writer) ReopenOutput(file string) (e error)

ReopenOutput will open the output file back without truncating the content.

func (*Writer) SetOutput

func (writer *Writer) SetOutput(path string)

SetOutput will set the output file to path.

func (*Writer) String

func (writer *Writer) String() string

String yes, it will print it in JSON-like format.

func (*Writer) Write

func (writer *Writer) Write(reader ReaderInterface) (int, error)

Write rows from Reader to file. Return n for the number of rows written, or e if an error happened.
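A sketch of copying rows from a reader to a writer batch by batch; the configuration file names are hypothetical, the dataset value is an assumption, and OpenWriter is assumed here to both load the writer configuration and open its output file,

	// Assumes imports: io, log, "git.sr.ht/~shulhan/pakakeh.go/lib/dsv",
	// "git.sr.ht/~shulhan/pakakeh.go/lib/tabula" (assumed import path).
	reader, e := dsv.NewReader("input.dsv", &tabula.Dataset{})
	if e != nil {
		log.Fatal(e)
	}
	defer reader.Close()

	writer := &dsv.Writer{}
	if e = dsv.OpenWriter(writer, "output.dsv"); e != nil {
		log.Fatal(e)
	}
	defer writer.Close()

	for {
		_, e = dsv.Read(reader)

		if _, werr := writer.Write(reader); werr != nil {
			log.Fatal(werr)
		}

		if e == io.EOF {
			break
		}
		if e != nil {
			log.Fatal(e)
		}
	}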

func (*Writer) WriteColumns

func (writer *Writer) WriteColumns(columns tabula.Columns,
	colMd []MetadataInterface,
) (
	n int,
	e error,
)

WriteColumns will write the content of columns to the output file. Return n for the number of rows written, and e if an error happened.

func (*Writer) WriteRawColumns

func (writer *Writer) WriteRawColumns(cols *tabula.Columns, sep *string) (
	nrow int,
	e error,
)

WriteRawColumns write raw columns using separator `sep` for each record to file.

We use a pointer for the separator parameter, so we can use an empty string as the separator.

func (*Writer) WriteRawDataset

func (writer *Writer) WriteRawDataset(dataset tabula.DatasetInterface,
	sep *string,
) (
	int, error,
)

WriteRawDataset will write content of dataset to file without metadata but using separator `sep` for each record.

We use a pointer for the separator parameter, so we can use an empty string as the separator.
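A sketch of writing a dataset raw, without output metadata; the configuration file names are hypothetical, and the assumptions are the same as above (tabula import path, &tabula.Dataset{} as the dataset, OpenWriter loading the writer configuration and opening its output),

	// Assumes imports: log, "git.sr.ht/~shulhan/pakakeh.go/lib/dsv",
	// "git.sr.ht/~shulhan/pakakeh.go/lib/tabula" (assumed import path).
	dataset := &tabula.Dataset{}
	if _, e := dsv.SimpleRead("input.dsv", dataset); e != nil {
		log.Fatal(e)
	}

	writer := &dsv.Writer{}
	if e := dsv.OpenWriter(writer, "output.dsv"); e != nil {
		log.Fatal(e)
	}
	defer writer.Close()

	// The separator is passed by pointer so that an empty string can be
	// given explicitly.
	sep := "\t"
	if _, e := writer.WriteRawDataset(dataset, &sep); e != nil {
		log.Fatal(e)
	}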

func (*Writer) WriteRawRow

func (writer *Writer) WriteRawRow(row *tabula.Row, sep, esc []byte) (e error)

WriteRawRow will write row data using separator `sep` for each record.

func (*Writer) WriteRawRows

func (writer *Writer) WriteRawRows(rows *tabula.Rows, sep *string) (
	nrow int,
	e error,
)

WriteRawRows write rows data using separator `sep` for each record. We use a pointer for the separator parameter, so we can use an empty string as the separator.

func (*Writer) WriteRow

func (writer *Writer) WriteRow(row *tabula.Row, recordMd []MetadataInterface) (
	e error,
)

WriteRow dump content of Row to file using format in metadata.

func (*Writer) WriteRows

func (writer *Writer) WriteRows(rows tabula.Rows, recordMd []MetadataInterface) (
	n int,
	e error,
)

WriteRows will loop over each row in the list of rows and write their content to the output file. Return n for the number of rows written, and e if an error happened.

type WriterInterface

type WriterInterface interface {
	ConfigInterface
	GetOutput() string
	SetOutput(path string)
	OpenOutput(file string) error
	Flush() error
	Close() error
}

WriterInterface is an interface for writing DSV data to file.

Source Files

config.go configinterface.go dsv.go dsvinterface.go metadata.go metadatainterface.go reader.go readererror.go readerinterface.go writer.go writerinterface.go
