/* Bruger:Byrial/programmer/simple title.c
 * (wiki page title; a stray "Udseende" navigation label followed it) */
#include <sys/types.h>
#include <ctype.h>
#include <limits.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "simple_title.h"
/*
 * Return the compiled regular expression used to spot Roman numerals.
 *
 * The pattern accepts the numerals 1 to 13 (all lower case or all upper
 * case, including the additive form "iiii") anchored at the start of the
 * string and followed by '_', '(' or the end of the string.
 *
 * The expression is compiled on the first call and the same object is
 * handed out on every later call.  If compilation fails, a message is
 * printed and the program exits with status 1.
 */
regex_t *get_preg ()
{
  static regex_t compiled;
  static int ready = 0;

  // Fast path: already compiled by an earlier call
  if (ready)
    return &compiled;

  int status = regcomp (&compiled,
                        "^(i|ii|iii|iiii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|"
                        "I|II|III|IIII|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII)(_|\\(|$)",
                        REG_EXTENDED);
  if (status != 0)
    {
      char message[128];
      regerror (status, &compiled, message, sizeof message);
      printf ("regcomp() failed with code %d: %s\n", status, message);
      exit (1);
    }

  ready = 1;
  return &compiled;
}
/*
 * Convert a Roman numeral at the start of s to its integer value.
 *
 * Only the letters i, v and x (either case) are understood, which is
 * enough for the values 1 to 13.  Parsing stops at the first character
 * that does not fit the shape  i* [v|x] i*.  Leading i's are subtracted
 * from a following v or x; trailing i's are added.  A string that does
 * not start with any of these letters yields 0.
 */
int convert_roman (const char *s)
{
  int value;
  int leading_ones = 0;

  // Count the i's in front of an optional v or x
  for (; *s == 'i' || *s == 'I'; s ++)
    leading_ones ++;

  switch (*s)
    {
    case 'v':
    case 'V':
      value = 5 - leading_ones;
      s ++;
      break;
    case 'x':
    case 'X':
      value = 10 - leading_ones;
      s ++;
      break;
    default:
      // Purely additive numeral (or no numeral at all)
      value = leading_ones;
      break;
    }

  // Add the i's that follow the v or x
  for (; *s == 'i' || *s == 'I'; s ++)
    value ++;

  return value;
}
/*
 * Append one byte to the output unless the buffer is already full.
 * `limit` points at the last byte of the buffer, which stays reserved
 * for the terminating NUL.
 */
static void append_byte (char **to, const char *limit, char c)
{
  if (*to < limit)
    {
      **to = c;
      ++ *to;
    }
}

/*
 * Append a NUL-terminated ASCII replacement text, byte by byte, stopping
 * silently when the buffer is full.
 */
static void append_text (char **to, const char *limit, const char *text)
{
  for (; *text; ++ text)
    append_byte (to, limit, *text);
}

/*
 * Append the multibyte encoding of wc.  Nothing is written when wc cannot
 * be encoded in the current locale or when the whole sequence does not fit.
 */
static void append_wide (char **to, const char *limit, wchar_t wc)
{
  char encoded[MB_LEN_MAX];
  mbstate_t state;
  memset (&state, 0, sizeof state);

  size_t len = wcrtomb (encoded, wc, &state);
  if (len == (size_t) -1 || len > (size_t) (limit - *to))
    return;

  memcpy (*to, encoded, len);
  *to += len;
}

/*
 * Tell whether wch lies in one of the Unicode blocks whose characters are
 * copied to the simplified title unchanged without a diagnostic.
 * See http://en.wikipedia.org/wiki/Mapping_of_Unicode_character_planes
 */
static int is_passthrough_script (wchar_t wch)
{
  return (wch >= 0x0370 && wch <= 0x03FF) // Greek and Coptic
    || (wch >= 0x0400 && wch <= 0x04FF) // Cyrillic
    || (wch >= 0x0530 && wch <= 0x058F) // Armenian
    || (wch >= 0x0590 && wch <= 0x05FF) // Hebrew
    || (wch >= 0x0600 && wch <= 0x06FF) // Arabic
    || (wch >= 0x0700 && wch <= 0x074F) // Syriac
    || (wch >= 0x0750 && wch <= 0x077F) // Arabic Supplement
    || (wch >= 0x0780 && wch <= 0x07BF) // Thaana
    || (wch >= 0x0900 && wch <= 0x097F) // Devanagari
    || (wch >= 0x0980 && wch <= 0x09FF) // Bengali
    || (wch >= 0x0A00 && wch <= 0x0A7F) // Gurmukhi
    || (wch >= 0x0A80 && wch <= 0x0AFF) // Gujarati
    || (wch >= 0x0B00 && wch <= 0x0B7F) // Oriya
    || (wch >= 0x0B80 && wch <= 0x0BFF) // Tamil
    || (wch >= 0x0C00 && wch <= 0x0C7F) // Telugu
    || (wch >= 0x0C80 && wch <= 0x0CFF) // Kannada
    || (wch >= 0x0D00 && wch <= 0x0D7F) // Malayalam
    || (wch >= 0x0D80 && wch <= 0x0DFF) // Sinhala
    || (wch >= 0x0E00 && wch <= 0x0EFF) // Thai
    || (wch >= 0x0F00 && wch <= 0x0FFF) // Tibetan
    || (wch >= 0x10A0 && wch <= 0x10FF) // Georgian
    || (wch >= 0x1200 && wch <= 0x137F) // Ethiopic
    || (wch >= 0x1380 && wch <= 0x139F) // Ethiopic Supplement
    || (wch >= 0x13A0 && wch <= 0x13FF) // Cherokee
    || (wch >= 0x1400 && wch <= 0x167F) // Unified Canadian Aboriginal Syllabics
    || (wch >= 0x16A0 && wch <= 0x16FF) // Runic
    || (wch >= 0x1780 && wch <= 0x17FF) // Khmer
    || (wch >= 0x1800 && wch <= 0x18AF) // Mongolian
    || (wch >= 0x1F00 && wch <= 0x1FFF) // Greek Extended
    || (wch >= 0x3040 && wch <= 0x309F) // Hiragana
    || (wch >= 0x3000 && wch <= 0x303F) // CJK Symbols and Punctuation
    || (wch >= 0x30A0 && wch <= 0x30FF) // Katakana
    || (wch >= 0x3400 && wch <= 0x4DBF) // CJK Unified Ideographs Extension A
    || (wch >= 0x4E00 && wch <= 0x9FFF) // CJK Unified Ideographs
    || (wch >= 0xAC00 && wch <= 0xD7AF) // Hangul
    || (wch >= 0xFB50 && wch <= 0xFDFF) // Arabic Presentation Forms-A
    || (wch >= 0xFE70 && wch <= 0xFEFF) // Arabic Presentation Forms-B
    || (wch >= 0x10330 && wch <= 0x1034F) // Gothic
    || (wch >= 0x20000 && wch <= 0x2A6DF); // CJK Unified Ideographs Extension B
}

/*
 * Make a simplified title.
 *
 * The title is reduced to a lower-case key:
 *  - a leading article or title ("Den_", "Det_", "The_", "Dei_", "Sir_",
 *    "De_") is skipped;
 *  - ASCII letters and digits are lower-cased, all other ASCII is dropped;
 *  - a Roman numeral 1..13 that directly follows '_' or '(' and is itself
 *    followed by '_', '(' or the end of the title is replaced by its
 *    decimal value (the delimiter after the numeral is consumed too);
 *  - multibyte characters are decoded with mbrtowc() - so the caller is
 *    presumably expected to have selected a UTF-8 LC_CTYPE locale - and
 *    folded to ASCII where a mapping is known, dropped when they are not
 *    alphanumeric, and otherwise copied unchanged.
 *
 * Parameters:
 *   title      NUL-terminated title to simplify.
 *   id         Only used in diagnostic messages.
 *   utf8error  Set to true when an invalid multibyte sequence is found;
 *              it is never reset to false here.
 *   lang       Language code; for "is" the Icelandic letters
 *              (á ð é í ó ú ý þ) are kept instead of being folded.
 *              NULL is treated as "not Icelandic".
 *
 * Returns a pointer to a static 256-byte buffer that is overwritten by the
 * next call.  Output that would not fit is silently truncated.
 * Exits with status 1 if regexec() reports an error other than "no match".
 */
const char *make_simple_title (const char *title, int id,
                               bool *utf8error, const char *lang)
{
  static char simple_title[256];
  char *to = simple_title;
  // Last byte of the buffer - reserved for the terminating NUL
  const char *const limit = simple_title + sizeof simple_title - 1;
  const char *from = title;
  const bool icelandic = lang != NULL && strcmp (lang, "is") == 0;

  // Private conversion state, so a decoding error can be cleared reliably
  mbstate_t decode_state;
  memset (&decode_state, 0, sizeof decode_state);

  // Pass articles etc.
  if (strncmp (title, "Den_", 4) == 0 ||   // Danish article
      strncmp (title, "Det_", 4) == 0 ||   // Danish article
      strncmp (title, "The_", 4) == 0 ||   // English article
      strncmp (title, "Dei_", 4) == 0 ||   // Nynorsk article
      strncmp (title, "Sir_", 4) == 0)     // English title
    {
      from += 4;
    }
  else if (strncmp (title, "De_", 3) == 0) // Danish article
    {
      from += 3;
    }

  while (*from)
    {
      unsigned char ch = (unsigned char) *from;

      if (ch < 128)
        {
          // ASCII character
          if (ch == '_' || ch == '(')
            {
              // Check for some Roman numerals
              regex_t *preg = get_preg ();
              regmatch_t pmatch[1];
              int rc = regexec (preg, from + 1, 1, pmatch, 0);
              if (rc == 0)
                {
                  // Match - write the value as one or two decimal digits
                  int i = convert_roman (from + 1);
                  if (i > 9)
                    append_byte (&to, limit, (char) ('0' + (i / 10)));
                  append_byte (&to, limit, (char) ('0' + (i % 10)));
                  // Skip the numeral; together with the ++ below this also
                  // consumes the delimiter that ended the match.
                  from += pmatch[0].rm_eo;
                }
              else if (rc != REG_NOMATCH)
                {
                  char errbuf[128];
                  regerror (rc, preg, errbuf, sizeof errbuf);
                  printf ("regexec() failed with code %d: %s\n", rc, errbuf);
                  exit (1);
                }
              // On REG_NOMATCH the '_' or '(' is simply dropped below.
            }
          if (isalnum (ch))
            append_byte (&to, limit, (char) tolower (ch));
          ++ from;
          continue;
        }

      // Multibyte UTF-8 character
      wchar_t wch;
      size_t wch_len = mbrtowc (&wch, from, 6, &decode_state);
      if (wch_len == (size_t) -1)
        {
          // Invalid sequence: report it through *utf8error, clear the
          // (now unspecified) conversion state and skip this byte.
          *utf8error = true;
          memset (&decode_state, 0, sizeof decode_state);
          ++ from;
          continue;
        }
      if (wch_len == (size_t) -2)
        {
          // This should never happen
          printf ("make_simple_title: "
                  "Too long multibyte char in '%s', id = %d\n",
                  title, id);
          // Forget the partial character and skip this byte
          memset (&decode_state, 0, sizeof decode_state);
          ++ from;
          continue;
        }
      from += wch_len;

      if (! iswalnum (wch))
        {
          // Non-alphanumerics are dropped, except the superscript digits
          if (wch == L'¹')
            append_byte (&to, limit, '1');
          else if (wch == L'²')
            append_byte (&to, limit, '2');
          else if (wch == L'³')
            append_byte (&to, limit, '3');
          continue;
        }

      wch = towlower (wch);

      if ((wch >= 0xFF10 && wch <= 0xFF19)     // Fullwidth digits 0-9
          || (wch >= 0xFF41 && wch <= 0xFF5A)) // Fullwidth letters a-z
        {
          // Fullwidth forms sit at a constant offset from ASCII, so the
          // same subtraction maps both the digits and the letters.
          append_byte (&to, limit, (char) (wch - 0xFF10 + '0'));
          continue;
        }
      if (wch >= 0x1D538 && wch <= 0x1D551) // Mathematical double-struck capital A-Z
        {
          append_byte (&to, limit, (char) (wch - 0x1D538 + 'a'));
          continue;
        }
      if (wch >= 0x1D552 && wch <= 0x1D56B) // Mathematical double-struck small a-z
        {
          append_byte (&to, limit, (char) (wch - 0x1D552 + 'a'));
          continue;
        }
      if (wch >= 0x2170 && wch <= 0x2178) // Small Roman Numeral 1-9
        {
          append_byte (&to, limit, (char) (wch - 0x2170 + '1')); // Is conversion to letters better?
          continue;
        }

      if (icelandic && wcschr (L"áðéíóúýþ", wch) != NULL)
        {
          // These are all normal Icelandic letters - use them as is
          append_wide (&to, limit, wch);
          continue;
        }

      // ASCII replacement chosen by the switch; NULL when the case has
      // already produced its output or the character is to be dropped.
      const char *ascii = NULL;

      switch (wch)
        {
        case L'ȝ': // U+21D Letter yogh in Middle English
          ascii = "3"; // number 3
          break;
        case L'ä': // German, Swedish
        case L'æ': // Icelandic, Danish, Norwegian
        case L'ǽ':
        case L'ǣ':
        case L'œ': // oe ligature (French, Latin),
                   // but, alas, also seen used instead of æ
          ascii = "ae";
          break;
        case L'ö': // Icelandic, Swedish, German
        case L'ø': // Danish, Norwegian
        case L'ǿ':
          ascii = "oe";
          break;
        case L'å': // Danish, Norwegian
        case L'ǻ':
          ascii = "aa";
          break;
        // Just ignore all other diacritics
        case L'á':
        case L'à':
        case L'â':
        case L'ã':
        case L'ă':
        case L'ā':
        case L'ą':
        case L'ạ':
        case L'ª':
        case L'ǎ':
        case L'ả':
        case L'ấ':
        case L'ẫ':
        case L'ậ':
        case L'ắ':
        case L'ẩ':
        case L'ầ':
        case L'ằ':
        case L'ặ':
        case L'ɐ': // U+0250 Near-open central vowel
          ascii = "a";
          break;
        case L'ḃ':
        case L'ƀ':
        case L'ƅ':
        case L'ɓ': // U+0253 (IPA voiced bilabial implosive, used in African languages)
        case L'ḇ':
          ascii = "b";
          break;
        case L'ç':
        case L'ć':
        case L'č':
        case L'ĉ':
        case L'ℂ':
        case L'ċ':
          ascii = "c";
          break;
        case L'đ':
        case L'ď':
        case L'ḍ':
        case L'ḋ':
        case L'ð': // Transcribed d in Danish
        case L'ɖ':
        case L'ɗ': // U+0257 (IPA voiced dental or alveolar implosive, used in African languages)
        case L'ḏ':
          ascii = "d";
          break;
        // Ligatures are given as code points: a literal holding two
        // letters would be a multi-character constant, not the ligature.
        case 0x01C6: // U+01C6 - dz with caron digraph
        case 0x01F3: // U+01F3 - dz digraph
          ascii = "dz";
          break;
        case L'é':
        case L'è':
        case L'ê':
        case L'ë':
        case L'ė':
        case L'ē':
        case L'ě':
        case L'ę':
        case L'ə':
        case L'ĕ':
        case L'ễ':
        case L'ế':
        case L'ệ':
        case L'ẹ':
        case L'ề':
        case L'ể':
        case L'ẽ':
        case L'ḗ':
        case L'ǝ': // U+01DD
          ascii = "e";
          break;
        case L'ḟ':
        case L'ƒ':
          ascii = "f";
          break;
        case 0xFB01: // U+FB01 - fi ligature
          ascii = "fi";
          break;
        case 0xFB02: // U+FB02 - fl ligature
          ascii = "fl";
          break;
        case L'ğ':
        case L'ĝ':
        case L'ģ':
        case L'ġ':
        case L'ǧ':
        case L'ḡ':
        case L'ǵ': // U+01F5
        case L'ǥ': // U+01E5
          ascii = "g";
          break;
        case L'ĥ':
        case L'ћ':
        case L'ḥ':
        case L'ħ':
        case L'ḫ': // U+1E2B
        case L'ẖ': // U+1E96
          ascii = "h";
          break;
        case L'í':
        case L'ì':
        case L'î':
        case L'ĩ':
        case L'ï':
        case L'ı':
        case L'ī':
        case L'ї':
        case L'ĭ':
        case L'i': // Normal i - lowercase of İ
        case L'ǐ':
        case L'ị':
        case L'ɨ':
        case L'į':
        case L'ỉ':
        case L'ɪ': // U+26A IPA Near-close near-front unrounded vowel
          ascii = "i";
          break;
        case 0x0133: // U+0133 - ij ligature
          ascii = "ij";
          break;
        case L'ĵ':
        case L'ʝ': // U+029D IPA Voiced palatal fricative
          ascii = "j";
          break;
        case L'ĸ':
        case L'ќ':
        case L'ķ':
        case L'ḵ':
        case L'ḱ': // U+1E31
        case L'ⱪ': // U+2C6A
          ascii = "k";
          break;
        case L'ł':
        case L'ļ':
        case L'ľ':
        case L'ḷ':
        case L'ǃ':
        case L'ĺ':
        case L'ɬ':
        case L'ℓ': // U+2113 Script small l
          ascii = "l";
          break;
        case L'ṁ':
        case L'ṃ':
          ascii = "m";
          break;
        case L'ñ':
        case L'ń':
        case L'ň':
        case L'ņ':
        case L'ṇ':
        case L'ŋ':
        case L'ℕ':
        case L'ṉ':
        case L'n': // Plain n - kept from the original table
        case L'ṅ': // U+1E45
          ascii = "n";
          break;
        case L'ó':
        case L'ò':
        case L'ô':
        case L'õ':
        case L'ō':
        case L'ő':
        case L'ọ':
        case L'ǫ':
        case L'º':
        case L'ơ':
        case L'ồ':
        case L'ố':
        case L'ờ':
        case L'ổ':
        case L'ớ':
        case L'ỗ':
        case L'ŏ':
        case L'ǒ':
        case L'ộ':
        case L'ợ':
        case L'ỏ':
        case L'ở': // U+1EDF
          ascii = "o";
          break;
        case L'ṗ':
          ascii = "p";
          break;
        case L'ℚ':
          ascii = "q";
          break;
        case L'ř':
        case L'ℝ':
        case L'ŕ':
        case L'ṛ':
        case L'ȑ':
        case L'ŗ':
          ascii = "r";
          break;
        case L'š':
        case L'ş':
        case L'ś':
        case L'ſ':
        case L'ŝ':
        case L'ș':
        case L'ṣ':
        case L'ṡ':
        case L'ʂ':
          ascii = "s";
          break;
        case L'ß':
          ascii = "ss";
          break;
        case L'ţ':
        case L'ť':
        case L'ṭ':
        case L'ț':
        case L'ṫ':
        case L'ŧ':
        case L'ṯ': // U+1E6F
          ascii = "t";
          break;
        case L'þ': // Transcribed th in Danish
          ascii = "th";
          break;
        case L'ú':
        case L'ù':
        case L'û':
        case L'ũ':
        case L'ū':
        case L'ů':
        case L'ư':
        case L'ǔ':
        case L'ų':
        case L'ŭ':
        case L'џ':
        case L'ṳ':
        case L'ű':
        case L'ứ':
        case L'ữ':
        case L'ự':
        case L'ừ':
        case L'µ':
        case L'ủ':
        case L'ǖ':
        case L'ǘ':
        case L'ǚ':
        case L'ǜ':
        case L'ụ':
        case L'ử':
          ascii = "u";
          break;
        case L'ʋ': // U+28B Based on italic V, used in IPA and some African languages
          ascii = "v";
          break;
        case L'ŵ':
        case L'ẁ':
        case L'ẃ':
        case L'ẅ':
          ascii = "w";
          break;
        case L'ý':
        case L'ü':
        case L'ŷ':
        case L'ÿ':
        case L'ỳ':
        case L'ỹ':
        case L'ў':
        case L'ẏ':
        case L'y': // Plain y - kept from the original table
          ascii = "y";
          break;
        case L'ż':
        case L'ž':
        case L'ź':
        case L'ℤ':
        case L'Ẓ':
        case L'ẓ':
        case L'ẑ':
        case L'ʒ': // U+0292 - ezh or tailed z see http://en.wikipedia.org/wiki/Ezh_%28letter%29
                   // IPA for Voiced postalveolar fricative
          ascii = "z";
          break;
        case L'ǀ': // IPA dental click (0x01C0)
        case L'ǂ': // IPA palatal click (0x01C2)
          ascii = "|"; // Vertical bar
          break;
        case L'ǁ': // IPA lateral click
          ascii = "||"; // Two vertical bars
          break;
        case L'ʔ': // Glottal stop (0x0294)
          ascii = "?"; // Question mark
          break;
        case L'ʾ': // U+02BE - Modifier letter right half ring -
                   // Used in transliterations - drop it.
        case L'ʿ': // 0x02BF - Modifier letter left half ring -
                   // Used in Latin transliterations of Hebrew, Arabic and other. Drop it.
        case L'ʼ': // (0x02BC - "modifier letter apostrophe") - Drop it.
        case L'ʻ': // "U+02BB MODIFIER LETTER TURNED COMMA" - Drop it.
        case L'ˁ': // U+02C1 Superscript voiced pharyngeal fricative - Drop it.
        case L'ˇ': // U+02C7 Caron or háček - Drop it.
        case L'ˊ': // U+02CA - Drop it.
        case L'ˤ': // U+02E4 Superscript voiced pharyngeal fricative - Drop it.
          break;
        case L'ɣ': // U+0263 Latin gamma used in some African languages.
          append_wide (&to, limit, L'γ'); // Replace with Greek gamma
          break;
        case L'ʊ': // U+028A Latin upsilon used in IPA and some African languages.
          append_wide (&to, limit, L'ω'); // Replace with Greek omega
          break;
        // Signs to be dropped:
        case L'ː': // U+02D0 Vowel length marker (looks like colon which is dropped)
        case L'ˆ': // U+02C6 Modifier letter circumflex accent
        case L'ƿ': // U+01BF Old English letter wynn (transliteration?)
          break;
        default:
          // Spacing Modifier Letters (0x02B0 - 0x02FF) need individual
          // treatment and are therefore not in the pass-through list.
          // Report a character that is not handled yet, but only if the
          // title contains more than this one character.
          if (! is_passthrough_script (wch) && strlen (title) > wch_len)
            printf ("Unhandled character in '%s' (%lc), id = %d, code = %d (%X)\n",
                    title, (wint_t) wch, id, (int) wch, (unsigned int) wch);
          // Known-but-unhandled scripts and everything not caught above
          // are taken as is
          append_wide (&to, limit, wch);
          break;
        }

      if (ascii != NULL)
        append_text (&to, limit, ascii);
    }

  *to = '\0';
  return simple_title;
}