/* Origin: wiki page "Bruger:Byrial/programmer/simple title da.c" (page navigation label "Udseende" was extraction residue). */
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "simple_title.h"
/*
 * Make a simplified title.
 *
 * The simplified title is built from `title` as follows:
 *   - a leading "Den_", "Det_", "The_", "Sir_" or "De_" is skipped;
 *   - ASCII letters and digits are copied in lower case, every other
 *     ASCII byte is dropped;
 *   - multibyte characters are decoded with mbrtowc, lowercased and
 *     replaced by the ASCII letters given in simple_title_fold;
 *     superscript 1, 2 and 3 become the plain digits;
 *   - letters from the Unicode blocks listed in
 *     simple_title_is_dropped_block are left out;
 *   - any other letter is reported on stdout and copied re-encoded.
 *
 * Parameters:
 *   title      NUL-terminated title; must not be NULL.
 *   id         only used in the diagnostic messages.
 *   utf8error  set to true when an invalid multibyte sequence is met
 *              (never reset to false here); may be NULL.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  The result is truncated (at a character boundary) so that it
 * always fits the buffer.
 *
 * Decoding depends on the current LC_CTYPE locale; the code assumes the
 * caller has selected a UTF-8 locale and that wchar_t holds Unicode
 * code points.
 */

/* Capacity of the result buffer, including the terminating NUL. */
enum { SIMPLE_TITLE_BUFSIZE = 256 };

/* Most bytes mbrtowc is allowed to inspect for one character. */
enum { SIMPLE_TITLE_MAX_MB_INPUT = 6 };

/* Return the ASCII replacement for the lowercased letter wch, or NULL
   when the letter has no entry in the table. */
static const char *simple_title_fold (wchar_t wch)
{
  switch (wch)
    {
    case L'ä': // German, Swedish
    case L'æ': // Icelandic, Danish, Norwegian
    case L'ǽ':
    case L'œ': // oe ligature (French, Latin),
               // but, alas, also seen used instead of æ
      return "ae";

    case L'ö': // Icelandic, Swedish, German
    case L'ø': // Danish, Norwegian
    case L'ǿ':
      return "oe";

    case L'å': // Danish, Norwegian
    case L'ǻ':
      return "aa";

    // Just ignore all other diacritics
    case L'á':
    case L'à':
    case L'â':
    case L'ã':
    case L'ă':
    case L'ā':
    case L'ą':
    case L'ạ':
    case L'ª':
    case L'ǎ':
    case L'ả':
    case L'ấ':
    case L'ẫ':
    case L'ậ':
    case L'ắ':
    case L'ẩ':
    case L'ầ':
      return "a";

    case L'ç':
    case L'ć':
    case L'č':
    case L'ĉ':
    case L'ℂ':
      return "c";

    case L'đ':
    case L'ď':
    case L'ḍ':
    case L'ð': // Transcribed d in Danish
      return "d";

    case L'é':
    case L'è':
    case L'ê':
    case L'ë':
    case L'ė':
    case L'ē':
    case L'ě':
    case L'ę':
    case L'ə':
    case L'ĕ':
    case L'ễ':
    case L'ế':
    case L'ệ':
    case L'ℓ': // NOTE(review): script small l is mapped to 'e' - confirm
      return "e";

    case L'ğ':
    case L'ĝ':
    case L'ģ':
    case L'ġ':
      return "g";

    case L'ĥ':
    case L'ћ':
    case L'ḥ':
    case L'ħ':
      return "h";

    case L'í':
    case L'ì':
    case L'î':
    case L'ĩ':
    case L'ï':
    case L'ı':
    case L'ī':
    case L'ї':
    case L'ĭ':
    case L'i': // Normal i - lowercase of İ
    case L'ǐ':
    case L'ị':
    case L'ɨ':
    case L'į':
      return "i";

    case 0x0133: // ij ligature, given by code point to avoid a
                 // multi-character constant
      return "ij";

    case L'ĵ':
      return "j";

    case L'ќ':
    case L'ķ':
      return "k";

    case L'ł':
    case L'ļ':
    case L'ľ':
      return "l";

    case L'ñ':
    case L'ń':
    case L'ň':
    case L'ņ':
    case L'ṇ':
    case L'ŋ':
    case L'ℕ':
      return "n";

    case L'ó':
    case L'ò':
    case L'ô':
    case L'õ':
    case L'ō':
    case L'ő':
    case L'ọ':
    case L'ǫ':
    case L'º':
    case L'ơ':
    case L'ồ':
    case L'ố':
    case L'ờ':
    case L'ổ':
    case L'ớ':
    case L'ỗ':
    case L'ŏ':
    case L'ǒ':
      return "o";

    case L'ℚ':
      return "q";

    case L'ř':
    case L'ℝ':
      return "r";

    case L'š':
    case L'ş':
    case L'ś':
    case L'ſ':
    case L'ŝ':
    case L'ș':
    case L'ṣ':
      return "s";

    case L'ß':
      return "ss";

    case L'ţ':
    case L'ť':
    case L'ṭ':
      return "t";

    case L'þ': // Transcribed th in Danish
      return "th";

    case L'ú':
    case L'ù':
    case L'û':
    case L'ũ':
    case L'ū':
    case L'ů':
    case L'ư':
    case L'ǔ':
    case L'ų':
    case L'ŭ':
    case L'џ':
    case L'ṳ':
    case L'ű':
    case L'ứ':
    case L'ữ':
    case L'ự':
    case L'ừ':
    case L'µ':
      return "u";

    case L'ŵ':
      return "w";

    case L'ý':
    case L'ü':
    case L'ŷ':
    case L'ÿ':
    case L'ỳ':
    case L'ỹ':
    case L'ў':
    case L'ẏ':
      return "y";

    case L'ż':
    case L'ž':
    case L'ź':
    case L'ℤ':
      return "z";

    default:
      return NULL;
    }
}

/* Tell whether wch lies in one of the Unicode blocks whose letters are
   left out of the simplified title without any diagnostic.
   See http://en.wikipedia.org/wiki/Mapping_of_Unicode_character_planes */
static bool simple_title_is_dropped_block (wchar_t wch)
{
  static const struct
  {
    unsigned long first;
    unsigned long last;
  } blocks[] = {
    { 0x02B0, 0x02FF }, // Spacing Modifier Letters
    { 0x0370, 0x03FF }, // Greek and Coptic
    { 0x0400, 0x04FF }, // Cyrillic
    { 0x0530, 0x058F }, // Armenian
    { 0x0590, 0x05FF }, // Hebrew
    { 0x0600, 0x06FF }, // Arabic
    { 0x0900, 0x097F }, // Devanagari
    { 0x10A0, 0x10FF }, // Georgian
    { 0x1400, 0x167F }, // Unified Canadian Aboriginal Syllabics
    { 0x3040, 0x309F }, // Hiragana
    { 0x30A0, 0x30FF }, // Katakana
    { 0x4E00, 0x9FFF }, // CJK Unified Ideographs
    { 0xAC00, 0xD7AF }, // Hangul
  };
  const unsigned long code = (unsigned long) wch;

  for (size_t i = 0; i < sizeof blocks / sizeof blocks[0]; ++i)
    {
      if (code >= blocks[i].first && code <= blocks[i].last)
        return true;
    }
  return false;
}

/* Copy len bytes from src to `to` when they fit in front of limit.
   Returns the new write position, or NULL when the piece does not fit
   (nothing is written in that case). */
static char *simple_title_append (char *to, const char *limit,
                                  const char *src, size_t len)
{
  if ((size_t) (limit - to) < len)
    return NULL;
  memcpy (to, src, len);
  return to + len;
}

const char *make_simple_title (const char *title, int id, bool *utf8error)
{
  static char simple_title[SIMPLE_TITLE_BUFSIZE];
  // Last position that may hold the terminating NUL.
  const char *const limit = simple_title + sizeof simple_title - 1;
  char *to = simple_title;
  const char *from = title;

  // Pass articles etc.
  if (strncmp (title, "Den_", 4) == 0 ||  // Danish article
      strncmp (title, "Det_", 4) == 0 ||  // Danish article
      strncmp (title, "The_", 4) == 0 ||  // English article
      strncmp (title, "Sir_", 4) == 0)    // English title
    {
      from += 4;
    }
  else if (strncmp (title, "De_", 3) == 0) // Danish article
    {
      from += 3;
    }

  const char *const title_end = from + strlen (from);

  // Private conversion state, so an earlier failed conversion (here or
  // elsewhere in the program) cannot influence the decoding.
  mbstate_t in_state;
  memset (&in_state, 0, sizeof in_state);

  while (*from)
    {
      char encoded[MB_LEN_MAX];
      char lower;
      const char *piece;
      size_t piece_len;
      const unsigned char ch = (unsigned char) *from;

      if (ch < 128)
        {
          // ASCII character
          ++from;
          if (!isalnum (ch))
            continue;
          lower = (char) tolower (ch);
          piece = &lower;
          piece_len = 1;
        }
      else
        {
          // multibyte UTF-8 character
          wchar_t wch;
          // Never let mbrtowc look beyond the terminating NUL.
          size_t avail = (size_t) (title_end - from) + 1;
          if (avail > SIMPLE_TITLE_MAX_MB_INPUT)
            avail = SIMPLE_TITLE_MAX_MB_INPUT;

          size_t retval = mbrtowc (&wch, from, avail, &in_state);
          if (retval == (size_t) -1)
            {
              if (utf8error != NULL)
                *utf8error = true;
              // The state is unspecified after an error: start afresh
              // and skip this byte.
              memset (&in_state, 0, sizeof in_state);
              ++from;
              continue;
            }
          if (retval == (size_t) -2)
            {
              // This should never happen
              printf ("make_simple_title: "
                      "Too long multibyte char in '%s', id = %d\n",
                      title, id);
              // Drop the partially consumed sequence and skip this byte
              memset (&in_state, 0, sizeof in_state);
              ++from;
              continue;
            }
          if (retval == 0)
            retval = 1; // cannot happen for a byte >= 128; avoids a stall
          from += retval;

          if (!iswalnum (wch))
            {
              if (wch == L'¹')
                piece = "1";
              else if (wch == L'²')
                piece = "2";
              else if (wch == L'³')
                piece = "3";
              else
                continue;
              piece_len = 1;
            }
          else
            {
              wch = towlower (wch);
              piece = simple_title_fold (wch);
              if (piece != NULL)
                {
                  piece_len = strlen (piece);
                }
              else if (simple_title_is_dropped_block (wch))
                {
                  continue;
                }
              else
                {
                  // Not handled yet ...
                  mbstate_t out_state;
                  memset (&out_state, 0, sizeof out_state);
                  const size_t len = wcrtomb (encoded, wch, &out_state);
                  if (len == (size_t) -1)
                    {
                      printf ("Illegal character in '%s', id = %d, code = %ld (%lX)\n",
                              title, id, (long) wch, (unsigned long) wch);
                      // Nothing that could be copied
                      continue;
                    }
                  printf ("Unhandled character in '%s' (%.*s), id = %d, code = %ld (%lX)\n",
                          title, (int) len, encoded, id,
                          (long) wch, (unsigned long) wch);
                  // Take everything not caught as is
                  piece = encoded;
                  piece_len = len;
                }
            }
        }

      char *next = simple_title_append (to, limit, piece, piece_len);
      if (next == NULL)
        break; // buffer full: truncate at a character boundary
      to = next;
    }

  *to = '\0';
  return simple_title;
}