[teklib-general] changeset in /hg/teklib/current: added

Franciska Schulze fschulze at neoscientists.org
Mon Nov 26 16:25:53 CET 2007


changeset 11e379f742e4 in /hg/teklib/current
details: http://teklib.org:8001/hg/teklib/current?cmd=changeset;node=11e379f742e4
description:
	added

diffs (268 lines):

diff -r 8215ea066ca1 -r 11e379f742e4 src/display_x11/utf8-tools.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/display_x11/utf8-tools.c	Mon Oct 01 17:30:22 2007 +0200
@@ -0,0 +1,264 @@
+
+/*
+**	UTF-8 encoder / decoder
+**	Written by Timm S. Mueller
+**	Placed in the Public Domain
+**
+**	References:
+**	http://www.cl.cam.ac.uk/~mgk25/unicode.html
+**	http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/*****************************************************************************/
+
+/*
+**	Encode unicode char (31bit) to UTF-8 (up to 6 chars)
+**	Reserve AT LEAST 6 bytes free space in the destination buffer
+*/
+
+unsigned char *encodeutf8(unsigned char *buf, int c)
+{
+	if (c < 128)
+	{
+		*buf++ = c;
+	}
+	else if (c < 2048)
+	{
+		*buf++ = 0xc0 + (c >> 6);
+		*buf++ = 0x80 + (c & 0x3f);
+	}
+	else if (c < 65536)
+	{
+		*buf++ = 0xe0 + (c >> 12);
+		*buf++ = 0x80 + ((c & 0xfff) >> 6);
+		*buf++ = 0x80 + (c & 0x3f);
+	}
+	else if (c < 2097152)
+	{
+		*buf++ = 0xf0 + (c >> 18);
+		*buf++ = 0x80 + ((c & 0x3ffff) >> 12);
+		*buf++ = 0x80 + ((c & 0xfff) >> 6);
+		*buf++ = 0x80 + (c & 0x3f);
+	}
+	else if (c < 67108864)
+	{
+		*buf++ = 0xf8 + (c >> 24);
+		*buf++ = 0x80 + ((c & 0xffffff) >> 18);
+		*buf++ = 0x80 + ((c & 0x3ffff) >> 12);
+		*buf++ = 0x80 + ((c & 0xfff) >> 6);
+		*buf++ = 0x80 + (c & 0x3f);
+	}
+	else
+	{
+		*buf++ = 0xfc + (c >> 30);
+		*buf++ = 0x80 + ((c & 0x3fffffff) >> 24);
+		*buf++ = 0x80 + ((c & 0xffffff) >> 18);
+		*buf++ = 0x80 + ((c & 0x3ffff) >> 12);
+		*buf++ = 0x80 + ((c & 0xfff) >> 6);
+		*buf++ = 0x80 + (c & 0x3f);
+	}
+	return buf;
+}
+
+/*****************************************************************************/
+
+/*
+**	UTF-8 reader
+**	Note: You must supply length, the reader treats 0 as a valid char!
+**
+**		struct readstringdata rs;
+**		struct utf8reader rd;
+**		int c;
+**
+**		rs.src = utf-8-string
+**		rs.srclen = length_in_bytes
+**
+**		rd.readchar = readstring;
+**		rd.accu = 0;
+**		rd.numa = 0;
+**		rd.bufc = -1;
+**		rd.udata = &rs;
+**
+**		while ((c = readutf8(&rd)) >= 0)
+**		{
+**			// unicode charcode (31bit) in c
+**		}
+*/
+
+
+struct utf8reader
+{
+	/* character reader callback: */
+	int (*readchar)(struct utf8reader *);
+	/* reader state: */
+	int accu, numa, min, bufc;
+	/* userdata to reader */
+	void *udata;
+};
+
+
+struct readstringdata
+{
+	/* src string: */
+	const unsigned char *src;
+	/* src string length: */
+	size_t srclen;
+};
+
+
+int readstring(struct utf8reader *rd)
+{
+	struct readstringdata *ud = rd->udata;
+	if (ud->srclen == 0)
+		return -1;
+	ud->srclen--;
+	return *ud->src++;
+}
+
+
+int readutf8(struct utf8reader *rd)
+{
+	int c;
+	for (;;)
+	{
+		if (rd->bufc >= 0)
+		{
+			c = rd->bufc;
+			rd->bufc = -1;
+		}
+		else
+			c = rd->readchar(rd);
+		if (c < 0)
+			return c;
+
+		if (c == 254 || c == 255)
+			break;
+
+		if (c < 128)
+		{
+			if (rd->numa > 0)
+			{
+				rd->bufc = c;
+				break;
+			}
+			return c;
+		}
+		else if (c < 192)
+		{
+			if (rd->numa == 0)
+				break;
+			rd->accu <<= 6;
+			rd->accu += c - 128;
+			rd->numa--;
+			if (rd->numa == 0)
+			{
+				if (rd->accu == 0 || rd->accu < rd->min ||
+					(rd->accu >= 55296 && rd->accu <= 57343))
+					break;
+				c = rd->accu;
+				rd->accu = 0;
+				return c;
+			}
+		}
+		else
+		{
+			if (rd->numa > 0)
+			{
+				rd->bufc = c;
+				break;
+			}
+
+			if (c < 224)
+			{
+				rd->min = 128;
+				rd->accu = c - 192;
+				rd->numa = 1;
+			}
+			else if (c < 240)
+			{
+				rd->min = 2048;
+				rd->accu = c - 224;
+				rd->numa = 2;
+			}
+			else if (c < 248)
+			{
+				rd->min = 65536;
+				rd->accu = c - 240;
+				rd->numa = 3;
+			}
+			else if (c < 252)
+			{
+				rd->min = 2097152;
+				rd->accu = c - 248;
+				rd->numa = 4;
+			}
+			else
+			{
+				rd->min = 67108864;
+				rd->accu = c - 252;
+				rd->numa = 5;
+			}
+		}
+	}
+	/* bad char */
+	rd->accu = 0;
+	rd->numa = 0;
+	return 65533;
+}
+
+#if defined(TEST)
+/*****************************************************************************/
+/*
+**	Test usage:
+**	# wget http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+**	# ./utf8test < UTF-8-test.txt > out.txt
+*/
+
+struct readfiledata
+{
+	FILE *file;
+};
+
+
+int readfile(struct utf8reader *rd)
+{
+	struct readfiledata *ud = rd->udata;
+	return fgetc(ud->file);
+}
+
+
+int main(int argc, char **argv)
+{
+	struct readfiledata rs;
+	struct utf8reader rd;
+	int c, ln = 1;
+	char outbuf[6], *bufend;
+
+	rs.file = stdin;
+
+	rd.readchar = readfile;
+	rd.accu = 0;
+	rd.numa = 0;
+	rd.bufc = -1;
+	rd.udata = &rs;
+
+	while ((c = readutf8(&rd)) >= 0)
+	{
+		/* unicode charcode (31bit) in c */
+		if (c == 65533)
+			fprintf(stderr, "Bad UTF-8 encoding / char in line %d\n", ln);
+		bufend = encodeutf8(outbuf, c);
+		fwrite(outbuf, bufend - outbuf, 1, stdout);
+		if (c == 10)
+			ln++;
+	}
+
+	return 1;
+}
+#endif
+
+


More information about the teklib-general mailing list