2014/06/07

BinaryWriter.Write(string) と StreamWriter.Write(string)

いろいろ違うらしい。以下のようなテストコードを作成していろいろ確認してみた。

using System;
using System.Linq;
using System.Text;

namespace ConsoleApplication3
{
    /// <summary>
    /// Console experiment that dumps, as hex, the bytes produced for the same string by
    /// <c>UTF8Encoding.GetBytes</c>, <c>System.IO.BinaryWriter.Write(string)</c> and
    /// <c>System.IO.StreamWriter.Write(string)</c>, with the encoder's BOM flag on and off.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs every comparison case, prints the resulting bytes and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var str = "Hello World!";

            Console.WriteLine("Input = {0}", str);

            Console.WriteLine("");
            Console.WriteLine("GetBytes (UTF8 with BOM)");
            DisplayBytes(GetBytes(str, true));

            Console.WriteLine("");
            Console.WriteLine("GetBytes (UTF8 without BOM)");
            DisplayBytes(GetBytes(str, false));

            Console.WriteLine("");
            Console.WriteLine("BinaryWriter (UTF8 with BOM)");
            DisplayBytes(BinaryWriter(str, true));

            Console.WriteLine("");
            Console.WriteLine("BinaryWriter(UTF8 without BOM)");
            DisplayBytes(BinaryWriter(str, false));

            Console.WriteLine("");
            Console.WriteLine("StreamWriter (UTF8 with BOM)");
            DisplayBytes(StreamWriter(str, true));

            Console.WriteLine("");
            Console.WriteLine("StreamWriter (UTF8 without BOM)");
            DisplayBytes(StreamWriter(str, false));

            // Same as the BOM case above, but the stream is no longer at position 0
            // when the writer is created.
            Console.WriteLine("");
            Console.WriteLine("StreamWriter2 (UTF8 with BOM)");
            DisplayBytes(StreamWriter2(str, true));

            // 0x100 characters: long enough that the length prefix written by
            // BinaryWriter no longer fits in a single 7-bit group.
            var longString = new String('A', 0x100);

            Console.WriteLine("");
            Console.WriteLine("BinaryWriter (long string, length: 0x{0:X})", longString.Length);
            var bwlong = BinaryWriter(longString, true);
            DisplayBytes(bwlong);
            int size;
            Console.WriteLine(
                "DecodeLEB128: Value = 0x{0:X}, Size = {1}",
                DecodeLEB128(bwlong, out size),
                size);

            Console.ReadKey();
        }

        /// <summary>
        /// Encodes <paramref name="str"/> directly with a <see cref="UTF8Encoding"/>.
        /// </summary>
        /// <param name="str">Text to encode.</param>
        /// <param name="withBOM">Value passed to the <see cref="UTF8Encoding"/> constructor.</param>
        /// <returns>The encoded bytes.</returns>
        static byte[] GetBytes(string str, bool withBOM)
        {
            return new UTF8Encoding(withBOM).GetBytes(str);
        }

        /// <summary>
        /// Writes <paramref name="str"/> to an in-memory stream through
        /// <c>System.IO.BinaryWriter.Write(string)</c> and returns everything that was written.
        /// </summary>
        /// <param name="str">Text to write.</param>
        /// <param name="withBOM">Value passed to the <see cref="UTF8Encoding"/> constructor.</param>
        /// <returns>The full contents of the stream after the write.</returns>
        static byte[] BinaryWriter(string str, bool withBOM)
        {
            using (var ms = new System.IO.MemoryStream())
            using (var writer = new System.IO.BinaryWriter(ms, new UTF8Encoding(withBOM)))
            {
                writer.Write(str);
                writer.Flush();

                // Snapshot the bytes before the writer (and with it the stream) is disposed.
                return ms.ToArray();
            }
        }

        /// <summary>
        /// Writes <paramref name="str"/> to an in-memory stream through
        /// <c>System.IO.StreamWriter.Write(string)</c>, starting at the beginning of the stream.
        /// </summary>
        /// <param name="str">Text to write.</param>
        /// <param name="withBOM">Value passed to the <see cref="UTF8Encoding"/> constructor.</param>
        /// <returns>The full contents of the stream after the write.</returns>
        static byte[] StreamWriter(string str, bool withBOM)
        {
            return WriteWithStreamWriter(str, withBOM, 0);
        }

        /// <summary>
        /// Same as <see cref="StreamWriter"/>, except that the stream is first moved to
        /// position 1, so the writer is created on a stream that is not at its start.
        /// </summary>
        /// <param name="str">Text to write.</param>
        /// <param name="withBOM">Value passed to the <see cref="UTF8Encoding"/> constructor.</param>
        /// <returns>The full contents of the stream, including the skipped leading byte.</returns>
        static byte[] StreamWriter2(string str, bool withBOM)
        {
            return WriteWithStreamWriter(str, withBOM, 1);
        }

        /// <summary>
        /// Shared implementation of the two StreamWriter cases: seeks a new
        /// in-memory stream to <paramref name="startPosition"/>, writes the text and
        /// returns the stream contents.
        /// </summary>
        static byte[] WriteWithStreamWriter(string str, bool withBOM, long startPosition)
        {
            using (var ms = new System.IO.MemoryStream())
            {
                // The seek must happen before the writer is constructed; the point of the
                // experiment is what the writer does when it starts away from position 0.
                ms.Seek(startPosition, System.IO.SeekOrigin.Begin);

                using (var writer = new System.IO.StreamWriter(ms, new UTF8Encoding(withBOM)))
                {
                    writer.Write(str);
                    writer.Flush();

                    // Snapshot the bytes before the writer (and with it the stream) is disposed.
                    return ms.ToArray();
                }
            }
        }

        /// <summary>
        /// Prints <paramref name="bytes"/> on one console line as space-separated,
        /// two-digit upper-case hexadecimal values.
        /// </summary>
        /// <param name="bytes">Bytes to print.</param>
        static void DisplayBytes(byte[] bytes)
        {
            Console.WriteLine(string.Join(" ", bytes.Select(b => b.ToString("X2"))));
        }

        /// <summary>
        /// Decodes the variable-length integer at the start of <paramref name="bytes"/>:
        /// each byte contributes its low 7 bits, least-significant group first, and a set
        /// high bit means another byte follows.
        /// </summary>
        /// <param name="bytes">Buffer that begins with the encoded integer.</param>
        /// <param name="size">Receives the number of bytes the encoded integer occupied.</param>
        /// <returns>The decoded value.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="FormatException">
        /// The buffer ends before a terminating byte is found, or the encoding does not
        /// fit in 32 bits.
        /// </exception>
        static int DecodeLEB128(byte[] bytes, out int size)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException("bytes");
            }

            // A 32-bit value needs at most five 7-bit groups; the fifth group starts at
            // bit 28 and therefore may only carry 4 payload bits and no continuation bit.
            const int LastGroupShift = 28;
            const byte LastGroupMax = 0x0F;

            int result = 0;
            int shift = 0;
            size = 0;

            while (true)
            {
                if (size >= bytes.Length)
                {
                    throw new FormatException("The encoded integer is truncated.");
                }

                byte b = bytes[size];
                size++;

                // Without this guard the shift count would pass 31, where C# masks it
                // to 5 bits and the groups would be OR-ed into the wrong positions.
                if (shift == LastGroupShift && b > LastGroupMax)
                {
                    throw new FormatException("The encoded integer does not fit in 32 bits.");
                }

                result |= (b & 0x7F) << shift;
                if ((b & 0x80) == 0)
                {
                    return result;
                }

                shift += 7;
            }
        }
    }
}

結果

Input = Hello World!

GetBytes (UTF8 with BOM)
48 65 6C 6C 6F 20 57 6F 72 6C 64 21

GetBytes (UTF8 without BOM)
48 65 6C 6C 6F 20 57 6F 72 6C 64 21

BinaryWriter (UTF8 with BOM)
0C 48 65 6C 6C 6F 20 57 6F 72 6C 64 21

BinaryWriter(UTF8 without BOM)
0C 48 65 6C 6C 6F 20 57 6F 72 6C 64 21

StreamWriter (UTF8 with BOM)
EF BB BF 48 65 6C 6C 6F 20 57 6F 72 6C 64 21

StreamWriter (UTF8 without BOM)
48 65 6C 6C 6F 20 57 6F 72 6C 64 21

StreamWriter2 (UTF8 with BOM)
00 48 65 6C 6C 6F 20 57 6F 72 6C 64 21

BinaryWriter (long string, length: 0x100)
80 02 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 4
1 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 4
1 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 4
1 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41
DecodeLEB128: Value = 0x100, Size = 2

わかったこと

  • Encoding.GetBytes、BinaryWriter.Write(string)、StreamWriter.Write(string) のすべてで null 終端文字は書き込まれない。
  • BinaryWriter.Write(string) は、文字列データの先頭に長さプリフィックス (文字長プリフィックス) なるものがつくらしい。
    • このプリフィックスは、文字数ではなくエンコード後のバイト数を、7 ビット エンコード整数 (符号なし LEB128 と同じフォーマット) で記録している。今回のテストは ASCII 文字だけなので、文字数とバイト数が一致している。
  • StreamWriter.Write(string) は UTF8Encoding の BOM が有効な場合で、かつストリーム位置が先頭のときに限り BOM を書き込む。

気をつけないと予期しない結果になりそうでありました。

0 件のコメント:

コメントを投稿