Linux kernel patch for supporting CAP-compatible coding system for all filesystems (2004/12/23)

[Japanese]

This is a patch for Linux version 2.4.26 that adds support for a CAP-compatible coding system to all filesystems. Because of compatibility issues, and also for historical reasons, using 8-bit characters in pathnames is sometimes a bad idea.

To alleviate this problem, several coding systems are supported. For instance, VFAT and earlier versions of the NTFS filesystem support the uni_xlate option, which converts unhandled Unicode characters into the escape character ":" followed by three hexadecimal numbers. CAP (Columbia AppleTalk Package), a part of netatalk (http://netatalk.sourceforge.net/), supports its own coding system called cap, which is similar to uni_xlate but slightly different. With the cap encoding system, non-ASCII characters are encoded as the escape character ":" followed by two hexadecimal digits. The cap encoding system is also supported by samba (http://us1.samba.org/samba/).

The following patch forces the CP932 codepage to use the cap coding system. Since this patch is applied to the Linux kernel's NLS (National Language Support), any filesystem that uses the Linux kernel's NLS benefits from this patch.

I have tested this patch on a Linux system running on an Intel Pentium III processor with NTFS filesystems. You can download the patch from http://www.lsnl.jp/~ohsaki/research/linux-tips/nls-cp932/linux-2.4.26-nls_932-patch.diff

  --- fs/nls/nls_cp932.c.orig   Sat Dec 22 02:41:56 2001
  +++ fs/nls/nls_cp932.c        Thu Dec 23 19:28:11 2004
  @@ -7883,6 +7883,129 @@
                return -EINVAL;
   }
   
  +/* Dec 23, 2004 by Hiroyuki Ohsaki */
  +#if 1
  +/* A CP932 character is at most two bytes long.  */
  +#define CAP_CHAR_MAXLEN (2)
  +
  +/* Write the CAP-escaped form of the SRC_LEN bytes at SRC to DST: each
  +   8-bit byte becomes ':' followed by two lowercase hexadecimal digits
  +   and every other byte is copied unchanged.  No NUL is appended.
  +   Return the escaped length, or -ENAMETOOLONG if it would exceed
  +   DST_LEN (DST may then hold a partial result).  */
  +static int cap_escape(const unsigned char *src, int src_len,
  +                   unsigned char *dst, int dst_len)
  +{
  +     static const char dec2hex[] = "0123456789abcdef";
  +     int i, n = 0;
  +
  +     for (i = 0; i < src_len; i++) {
  +             int escape = (src[i] >= 0x80);
  +
  +             /* check the room first so DST is never overrun */
  +             if (n + (escape ? 3 : 1) > dst_len)
  +                     return -ENAMETOOLONG;
  +             if (escape) {
  +                     dst[n++] = ':';
  +                     dst[n++] = dec2hex[src[i] >> 4];
  +                     dst[n++] = dec2hex[src[i] & 0xf];
  +             } else {
  +                     dst[n++] = src[i];
  +             }
  +     }
  +     return n;
  +}
  +
  +/* Return the value of the hexadecimal digit C (either case), or -1 if
  +   C is not a hexadecimal digit.  */
  +static int hex2dec(unsigned char c)
  +{
  +     if (c >= '0' && c <= '9')
  +             return c - '0';
  +     if (c >= 'a' && c <= 'f')
  +             return c - 'a' + 10;
  +     if (c >= 'A' && c <= 'F')
  +             return c - 'A' + 10;
  +     return -1;
  +}
  +
  +/* Decode the CAP-escaped SRC_LEN bytes at SRC into DST, stopping once
  +   DST_LEN bytes have been produced or SRC is exhausted.  ':' followed
  +   by two hexadecimal digits yields one byte; any other byte, including
  +   a ':' that does not start such a sequence, is copied unchanged.
  +   Store the number of SRC bytes consumed in *SRC_USED and return the
  +   number of bytes written to DST.  */
  +static int cap_unescape(const unsigned char *src, int src_len,
  +                     unsigned char *dst, int dst_len, int *src_used)
  +{
  +     int i = 0, n = 0, hi, lo;
  +
  +     while (i < src_len && n < dst_len) {
  +             /* test the length first so that SRC is never overrun */
  +             if (src[i] == ':' && src_len - i >= 3 &&
  +                 (hi = hex2dec(src[i + 1])) >= 0 &&
  +                 (lo = hex2dec(src[i + 2])) >= 0) {
  +                     dst[n++] = (hi << 4) | lo;
  +                     i += 3;
  +             } else {
  +                     dst[n++] = src[i++];
  +             }
  +     }
  +     *src_used = i;
  +     return n;
  +}
  +
  +/* Convert UNI to CP932 and store its CAP-escaped form in OUT, which has
  +   room for BOUNDLEN bytes.  Return the number of bytes stored, or the
  +   non-positive result of uni2char() if the conversion failed, or
  +   -ENAMETOOLONG if the escaped form does not fit in OUT.  */
  +static int uni2char_xlate(const wchar_t uni,
  +                       unsigned char *out, int boundlen)
  +{
  +     unsigned char buf[CAP_CHAR_MAXLEN];
  +     int buflen;
  +
  +     /* convert Unicode character to CP932 string */
  +     buflen = uni2char(uni, buf, boundlen < CAP_CHAR_MAXLEN ?
  +                       boundlen : CAP_CHAR_MAXLEN);
  +     if (buflen <= 0)
  +             return buflen;
  +     /* escape 8-bit characters directly into OUT */
  +     return cap_escape(buf, buflen, out, boundlen);
  +}
  +
  +/* Convert the first CAP-escaped CP932 character among the BOUNDLEN
  +   bytes at RAWSTRING to Unicode and store it in *UNI.  Return the
  +   number of RAWSTRING bytes consumed, or the non-positive result of
  +   char2uni() if the conversion failed.  */
  +static int char2uni_xlate(const unsigned char *rawstring, int boundlen,
  +                 wchar_t *uni)
  +{
  +     unsigned char buf[CAP_CHAR_MAXLEN] = { 0 };
  +     int buflen, unilen, used;
  +
  +     /* decode only as many bytes as one CP932 character can occupy */
  +     buflen = cap_unescape(rawstring, boundlen, buf, CAP_CHAR_MAXLEN, &used);
  +     /* convert CP932 string to Unicode character */
  +     unilen = char2uni(buf, buflen, uni);
  +     if (unilen <= 0)
  +             return unilen;
  +     /* calculate # of characters consumed: when char2uni() used fewer
  +        bytes than were decoded, recount the input behind just those */
  +     if (unilen < buflen)
  +             cap_unescape(rawstring, boundlen, buf, unilen, &used);
  +     return used;
  +}
  +
  +static struct nls_table table = {
  +     "cp932",
  +     uni2char_xlate,
  +     char2uni_xlate,
  +     charset2lower,
  +     charset2upper,
  +     THIS_MODULE,
  +};
  +#else
   static struct nls_table table = {
        "cp932",
        uni2char,
  @@ -7891,6 +8014,7 @@
        charset2upper,
        THIS_MODULE,
   };
  +#endif
   
   static int __init init_nls_cp932(void)
   {
 

Hiroyuki Ohsaki (ohsaki[atmark]lsnl.jp)