From: Sergey Matveev Date: Mon, 24 Nov 2025 16:41:52 +0000 (+0300) Subject: Slightly more compact BLOB X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=183ea2a5be76ecec3c8524128f124f8ce76bb8492539037e598e1cb463300791;p=keks.git Slightly more compact BLOB --- diff --git a/c/lib/dec.c b/c/lib/dec.c index c726d1d..a831f7a 100644 --- a/c/lib/dec.c +++ b/c/lib/dec.c @@ -167,11 +167,20 @@ KEKSAtomDecode( // NOLINT(misc-no-recursion) break; case KEKSAtomBlob: { atom->typ = KEKSItemBlob; - (*got) += 8; - if (len < (*got)) { - return KEKSErrNotEnough; + if (buf[1] != KEKSAtomPint) { + return KEKSErrBlobBadAtom; + } + size_t pintGot = 0; + struct KEKSAtom pint = {0}; + enum KEKSErr err = KEKSAtomDecode(&pintGot, &pint, buf + 1, len - 1); + if (err != KEKSErrNo) { + return err; } - const uint64_t chunkLen = keksFromBE(buf + 1, 8); + (*got) += pintGot; + if (pint.typ != KEKSItemPint) { + return KEKSErrBlobBadAtom; + } + const uint64_t chunkLen = pint.v.pint; if (chunkLen > (SIZE_MAX - 1)) { return KEKSErrLenTooBig; } @@ -182,11 +191,11 @@ KEKSAtomDecode( // NOLINT(misc-no-recursion) case KEKSAtomPint: case KEKSAtomNint: { atom->typ = (tag == KEKSAtomPint) ? 
KEKSItemPint : KEKSItemNint; - size_t binGot = 0; - struct KEKSAtom bin = {0}; if ((buf[1] & (unsigned char)KEKSAtomStrings) == 0) { return KEKSErrIntNonBin; } + size_t binGot = 0; + struct KEKSAtom bin = {0}; enum KEKSErr err = KEKSAtomDecode(&binGot, &bin, buf + 1, len - 1); if (err != KEKSErrNo) { return err; diff --git a/c/lib/enc.c b/c/lib/enc.c index 683a9e3..201dc3c 100644 --- a/c/lib/enc.c +++ b/c/lib/enc.c @@ -256,14 +256,16 @@ KEKSAtomBlobEncode( const size_t chunkLen) { assert(len != NULL); - (*len) = 1 + 8; - if (cap < 1 + 8) { + assert(chunkLen != 0); + (*len) = 1; + if (cap <= 1) { return false; } assert(buf != NULL); buf[0] = KEKSAtomBlob; - keksToBE(buf + 1, 8, (uint64_t)chunkLen - 1); - return true; + bool ok = KEKSAtomUintEncode(len, buf + 1, cap - 1, chunkLen - 1); + (*len)++; + return ok; } static bool diff --git a/go/atom-decode.go b/go/atom-decode.go index 0138409..cd97983 100644 --- a/go/atom-decode.go +++ b/go/atom-decode.go @@ -17,7 +17,6 @@ package keks import ( "errors" - "math/big" "strings" "unicode/utf8" "unsafe" @@ -38,6 +37,7 @@ var ( ErrTAINonMinimal = errors.New("non-minimal TAI64") ErrTAITooManyNsecs = errors.New("too many nanoseconds") ErrTAITooManyAsecs = errors.New("too many attoseconds") + ErrBlobBadInt = errors.New("blob with non Pint") ) func (ctx *Decoder) DecodeAtom() (t types.Type, err error) { @@ -98,12 +98,26 @@ func (ctx *Decoder) DecodeAtom() (t types.Type, err error) { case AtomMap: t = types.Map case AtomBLOB: - var s string - s, err = ctx.getBytes(8) + tag, err = ctx.getByte() + if err != nil { + return + } + if AtomType(tag) != AtomPInt { + err = ErrBlobBadInt + return + } + var isBig bool + isBig, err = ctx.getInt(types.UInt) if err != nil { return } - chunkLen := be.Get([]byte(s)) + if isBig { + ctx.bigints = ctx.bigints[:len(ctx.bigints)-1] + err = ErrLenTooBig + return + } + chunkLen := ctx.uints[len(ctx.uints)-1] + ctx.uints = ctx.uints[:len(ctx.uints)-1] if chunkLen >= (1<<63)-1 { err = ErrLenTooBig return 
@@ -117,54 +131,13 @@ func (ctx *Decoder) DecodeAtom() (t types.Type, err error) { } else { t = types.Int } - tag, err = ctx.getByte() - if err != nil { - return - } - if tag&AtomStrings == 0 || tag&AtomIsUTF8 != 0 { - err = ErrIntBad - return - } - var s string - s, err = ctx.getStr(tag) + var isBig bool + isBig, err = ctx.getInt(t) if err != nil { return } - if len(s) == 0 { - if t == types.UInt { - ctx.uints = append(ctx.uints, 0) - } else { - ctx.ints = append(ctx.ints, -1) - } - break - } - if s[0] == 0 { - err = ErrIntNonMinimal - return - } - if len(s) > 8 { - bi := new(big.Int).SetBytes([]byte(s)) - if t == types.Int { - bi = bi.Add(bi, big.NewInt(1)) - bi = bi.Neg(bi) - } + if isBig { t = types.BigInt - ctx.bigints = append(ctx.bigints, bi) - break - } - i := be.Get([]byte(s)) - if t == types.UInt { - ctx.uints = append(ctx.uints, i) - } else { - if i >= (1 << 63) { - bi := new(big.Int).SetBytes([]byte(s)) - bi = bi.Add(bi, big.NewInt(1)) - bi = bi.Neg(bi) - ctx.bigints = append(ctx.bigints, bi) - t = types.BigInt - } else { - ctx.ints = append(ctx.ints, -1-int64(i)) - } } case AtomFloatNaN: t = types.Float diff --git a/go/atom-encode.go b/go/atom-encode.go index 7987c4f..74dd91d 100644 --- a/go/atom-encode.go +++ b/go/atom-encode.go @@ -155,10 +155,13 @@ func FloatEncode(w io.Writer, v *Float) (written int64, err error) { // Write an encoded BLOB atom. func BlobAtomEncode(w io.Writer, chunkLen int64) (written int64, err error) { - l := make([]byte, 9) - l[0] = byte(AtomBLOB) - be.Put(l[1:], uint64(chunkLen-1)) - return io.Copy(w, bytes.NewReader(l)) + _, err = io.Copy(w, bytes.NewReader([]byte{byte(AtomBLOB), byte(AtomPInt)})) + if err != nil { + return + } + written, err = atomUintEncode(w, uint64(chunkLen-1)) + written += 2 + return } // Write an encoded BLOB. 
diff --git a/go/blob_test.go b/go/blob_test.go index d8d5114..cbf4b2a 100644 --- a/go/blob_test.go +++ b/go/blob_test.go @@ -18,16 +18,15 @@ package keks import ( "bytes" + "encoding/hex" "io" "testing" "testing/quick" - - "go.cypherpunks.su/keks/be" ) func TestBlobMultipleOfChunkLen(t *testing.T) { bin := bytes.Join([][]byte{ - mustHexDec("0B0000000000000003"), + mustHexDec("0B0C8103"), {0x84}, []byte("test"), {0x84}, @@ -43,7 +42,7 @@ func TestBlobMultipleOfChunkLen(t *testing.T) { t.Fatal(err) } if !bytes.Equal(encoded, bin) { - t.Fatal("encoded differs") + t.Fatal("encoded differs", hex.EncodeToString(encoded), hex.EncodeToString(bin)) } decoder := NewDecoderFromBytes(append(encoded, Junk...), nil) decoded, err := decoder.Decode() @@ -83,7 +82,7 @@ func TestBlobMultipleOfChunkLen(t *testing.T) { func TestBlobLargerOfChunkLen(t *testing.T) { bin := bytes.Join([][]byte{ - mustHexDec("0B0000000000000003"), + mustHexDec("0B0C8103"), {0x84}, []byte("test"), {0x84}, @@ -142,7 +141,7 @@ func TestBlobLargerOfChunkLen(t *testing.T) { } func TestBlobEmpty(t *testing.T) { - bin := mustHexDec("0B0000000000000003" + "80") + bin := mustHexDec("0B0C8103" + "80") encoded, err := EncodeBuf(BlobReader{ ChunkLen: 4, R: bytes.NewReader(nil), @@ -241,7 +240,7 @@ func TestBlobSymmetric(t *testing.T) { func TestBlobNotEnoughData(t *testing.T) { bin := bytes.Join([][]byte{ - mustHexDec("0B0000000000000003"), + mustHexDec("0B0C8103"), {0x84}, []byte("test"), {0x84}, @@ -254,17 +253,17 @@ func TestBlobNotEnoughData(t *testing.T) { } func TestBlobTooLong(t *testing.T) { - bin := make([]byte, 1+8) - bin[0] = byte(AtomBLOB) - be.Put(bin[1:], (1<<63)-1) - _, err := NewDecoderFromBytes(bin, nil).Decode() + var buf bytes.Buffer + buf.Write([]byte{byte(AtomBLOB)}) + UIntEncode(&buf, (1<<63)-1) + _, err := NewDecoderFromBytes(buf.Bytes(), nil).Decode() if err != ErrLenTooBig { t.Fatal(err) } } func TestBlobNotEnoughDataForLength(t *testing.T) { - bin := mustHexDec("0B00000000") + bin := 
mustHexDec("0B0C81") _, err := NewDecoderFromBytes(bin, nil).Decode() if err != io.ErrUnexpectedEOF { t.Fatal(err) @@ -273,7 +272,7 @@ func TestBlobNotEnoughDataForLength(t *testing.T) { func TestBlobWrongTerminatorLength(t *testing.T) { bin := bytes.Join([][]byte{ - mustHexDec("0B0000000000000003"), + mustHexDec("0B0C8103"), {0x84}, []byte("test"), {0x84}, @@ -289,7 +288,7 @@ func TestBlobWrongTerminatorLength(t *testing.T) { func TestBlobWrongTerminatorTag(t *testing.T) { bin := bytes.Join([][]byte{ - mustHexDec("0B0000000000000003"), + mustHexDec("0B0C8103"), {0x84}, []byte("test"), {0x84}, @@ -304,10 +303,13 @@ func TestBlobWrongTerminatorTag(t *testing.T) { } func TestBlobTooDeep(t *testing.T) { - bin := []byte{byte(AtomBLOB)} - bin = append(bin, bytes.Repeat([]byte{0x01}, 8)...) - bin = append(bin, bytes.Repeat([]byte{byte(AtomList)}, 1000)...) - if _, err := NewDecoderFromBytes(bin, nil).Decode(); err != ErrBlobBadAtom { + var buf bytes.Buffer + buf.Write([]byte{byte(AtomBLOB)}) + UIntEncode(&buf, 1) + for range 1000 { + buf.Write([]byte{byte(AtomList)}) + } + if _, err := NewDecoderFromBytes(buf.Bytes(), nil).Decode(); err != ErrBlobBadAtom { t.Fatal(err) } } diff --git a/go/int.go b/go/int.go new file mode 100644 index 0000000..cdffc12 --- /dev/null +++ b/go/int.go @@ -0,0 +1,77 @@ +// KEKS -- Go KEKS codec implementation +// Copyright (C) 2024-2025 Sergey Matveev +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation, version 3 of the License. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this program. If not, see . 
+ +package keks + +import ( + "math/big" + + "go.cypherpunks.su/keks/be" + "go.cypherpunks.su/keks/types" +) + +func (ctx *Decoder) getInt(t types.Type) (isBig bool, err error) { + var tag byte + tag, err = ctx.getByte() + if err != nil { + return + } + if tag&AtomStrings == 0 || tag&AtomIsUTF8 != 0 { + err = ErrIntBad + return + } + var s string + s, err = ctx.getStr(tag) + if err != nil { + return + } + if len(s) == 0 { + if t == types.UInt { + ctx.uints = append(ctx.uints, 0) + } else { + ctx.ints = append(ctx.ints, -1) + } + return + } + if s[0] == 0 { + err = ErrIntNonMinimal + return + } + if len(s) > 8 { + isBig = true + bi := new(big.Int).SetBytes([]byte(s)) + if t == types.Int { + bi = bi.Add(bi, big.NewInt(1)) + bi = bi.Neg(bi) + } + ctx.bigints = append(ctx.bigints, bi) + return + } + i := be.Get([]byte(s)) + if t == types.UInt { + ctx.uints = append(ctx.uints, i) + } else { + if i >= (1 << 63) { + isBig = true + bi := new(big.Int).SetBytes([]byte(s)) + bi = bi.Add(bi, big.NewInt(1)) + bi = bi.Neg(bi) + ctx.bigints = append(ctx.bigints, bi) + } else { + ctx.ints = append(ctx.ints, -1-int64(i)) + } + } + return +} diff --git a/py3/keks.py b/py3/keks.py index d0e2ee6..0cad765 100755 --- a/py3/keks.py +++ b/py3/keks.py @@ -323,7 +323,7 @@ def dumps(v): if isinstance(v, Blob): assert (v.l > 0) and (v.l <= (1 << 64)) l, v = v.l, v.v - raws = [TagBlobb, (l-1).to_bytes(8, "big")] + raws = [TagBlobb, dumps(l-1)] append = raws.append chunks = len(v) // l for i in range(chunks): @@ -527,10 +527,12 @@ def _loads(v, sets=False, leapsecUTCAllow=False, _allowContainers=True): ret = set(ret.keys()) return ret, v if b == TagBlob: - if len(v) < 1+8: - raise NotEnoughData(1+8-len(v)) - l = 1 + int.from_bytes(v[1:1+8], "big") - v = v[1+8:] + if len(v) < 3: + raise NotEnoughData(3-len(v)) + if v[1] != TagPInt: + raise DecodeError("blob without Pint") + l, v = _loads(v[1:]) + l += 1 raws = [] while True: i, v = _loads(v, _allowContainers=False) diff --git 
a/py3/tests/test_blob.py b/py3/tests/test_blob.py index 2ee62ec..b080581 100644 --- a/py3/tests/test_blob.py +++ b/py3/tests/test_blob.py @@ -36,7 +36,7 @@ class TestBlob(TestCase): self.assertSequenceEqual( encoded, b"".join(( - bytes.fromhex("0B0000000000000003"), + bytes.fromhex("0B0C8103"), bytes.fromhex("84"), b"test", bytes.fromhex("84"), b"data", bytes.fromhex("80"), @@ -54,7 +54,7 @@ class TestBlob(TestCase): self.assertSequenceEqual( encoded, b"".join(( - bytes.fromhex("0B0000000000000003"), + bytes.fromhex("0B0C8103"), bytes.fromhex("84"), b"test", bytes.fromhex("84"), b"data", bytes.fromhex("81"), b"2", @@ -72,7 +72,7 @@ class TestBlob(TestCase): self.assertSequenceEqual( encoded, b"".join(( - bytes.fromhex("0B0000000000000003"), + bytes.fromhex("0B0C8103"), bytes.fromhex("80"), )), ) @@ -90,7 +90,7 @@ class TestBlob(TestCase): chunks = [urandom(chunkLen) for _ in range(chunks)] encoded = b"".join(( b"\x0b", - (chunkLen-1).to_bytes(8, "big"), + dumps(chunkLen-1), b"".join(dumps(chunk) for chunk in chunks), b"\x80", junk, @@ -102,7 +102,7 @@ class TestBlob(TestCase): def test_throws_when_not_enough_data(self) -> None: encoded = b"".join(( - bytes.fromhex("0B0000000000000003"), + bytes.fromhex("0B0C8103"), bytes.fromhex("84"), b"test", bytes.fromhex("84"), b"da", )) @@ -111,14 +111,14 @@ class TestBlob(TestCase): self.assertEqual(err.exception.n, 2) def test_throws_when_not_enough_data_for_length(self) -> None: - encoded = bytes.fromhex("0B00000000") + encoded = bytes.fromhex("0B0C81") with self.assertRaises(NotEnoughData) as err: loads(encoded) - self.assertEqual(err.exception.n, 8-4) + self.assertEqual(err.exception.n, 1) def test_throws_when_wrong_terminator_length(self) -> None: encoded = b"".join(( - bytes.fromhex("0B0000000000000003"), + bytes.fromhex("0B0C8103"), bytes.fromhex("84"), b"test", bytes.fromhex("84"), b"data", bytes.fromhex("8A"), b"terminator", @@ -129,7 +129,7 @@ class TestBlob(TestCase): def test_throws_when_wrong_terminator_tag(self) 
-> None: encoded = b"".join(( - bytes.fromhex("0B0000000000000003"), + bytes.fromhex("0B0C8103"), bytes.fromhex("84"), b"test", bytes.fromhex("84"), b"data", bytes.fromhex("04"), b"that was a wrong tag", diff --git a/py3/tests/test_recursion.py b/py3/tests/test_recursion.py index da628b0..b17fc4c 100644 --- a/py3/tests/test_recursion.py +++ b/py3/tests/test_recursion.py @@ -18,6 +18,7 @@ from unittest import TestCase from keks import _byte from keks import DecodeError +from keks import dumps from keks import loads from keks import TagBlob from keks import TagList @@ -34,7 +35,7 @@ class TestTooDeepInt(TestCase): class TestTooDeepBlob(TestCase): def runTest(self) -> None: with self.assertRaises(DecodeError) as err: - loads(_byte(TagBlob) + (8 * b"\x01") + _byte(TagList) * 1000) + loads(_byte(TagBlob) + dumps(1) + _byte(TagList) * 1000) self.assertEqual(str(err.exception), "unknown tag") diff --git a/spec/encoding/BLOB b/spec/encoding/BLOB index e7eb48d..22dddc1 100644 --- a/spec/encoding/BLOB +++ b/spec/encoding/BLOB @@ -1,8 +1,8 @@ BLOB (binary large object) allows you to transfer binary data in chunks, in a streaming way, when data may not fit in memory. -64-bit big-endian integer follows the BLOB tag, setting the following -chunks payload size (+1). Then come one or more binary [encoding/String] +[encoding/INT] follows the BLOB tag, setting the following chunks +payload size, plus one. Then come one or more binary [encoding/String] with the chunk-length payload. All of them, except for the last one, must have fixed chunk length payload. Last terminating string's payload must be shorter. @@ -10,12 +10,12 @@ payload must be shorter. Data format definition must specify exact chunk size expected to be used, if it needs deterministic encoding. - BLOB chunk-len [BIN(len=chunk-len) || ...] BIN(len