Spring til indhold

Bruger:Byrial/programmer/simple title da.c

Fra Wikipedia, den frie encyklopædi
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "simple_title.h"

/*
 * Append LEN bytes from SRC at *TO, but only if all of them fit before
 * LIMIT (the position reserved for the terminating NUL).
 * Returns false, and writes nothing, when there is not enough room, so a
 * multibyte character is never split by truncation.
 */
static bool st_append (char **to, const char *limit,
                       const char *src, size_t len)
{
   if (len > (size_t) (limit - *to))
     return false;
   memcpy (*to, src, len);
   *to += len;
   return true;
}

/*
 * Tell whether WCH lies in one of the Unicode blocks that are dropped
 * from the simplified title without any diagnostic.
 * See http://en.wikipedia.org/wiki/Mapping_of_Unicode_character_planes
 */
static bool st_is_skipped_script (wchar_t wch)
{
   static const struct { wchar_t first, last; } blocks[] =
     {
        { 0x02B0, 0x02FF }, // Spacing Modifier Letters
        { 0x0370, 0x03FF }, // Greek and Coptic
        { 0x0400, 0x04FF }, // Cyrillic
        { 0x0530, 0x058F }, // Armenian
        { 0x0590, 0x05FF }, // Hebrew
        { 0x0600, 0x06FF }, // Arabic
        { 0x0900, 0x097F }, // Devanagari
        { 0x10A0, 0x10FF }, // Georgian
        { 0x1400, 0x167F }, // Unified Canadian Aboriginal Syllabics
        { 0x3040, 0x309F }, // Hiragana
        { 0x30A0, 0x30FF }, // Katakana
        { 0x4E00, 0x9FFF }, // CJK Unified Ideographs
        { 0xAC00, 0xD7AF }, // Hangul
     };

   for (size_t i = 0; i < sizeof blocks / sizeof blocks[0]; ++i)
     {
        if (wch >= blocks[i].first && wch <= blocks[i].last)
          return true;
     }
   return false;
}

/*
 * Map an already lowercased, alphanumeric wide character to its ASCII
 * replacement.
 *
 * Returns a string literal with the replacement, an empty string for
 * characters that are dropped silently, or NULL when the character is
 * not handled here (the caller then reports it and copies it as is).
 */
static const char *st_ascii_replacement (wchar_t wch)
{
   switch (wch)
     {
      case L'ä': // German, Swedish
      case L'æ': // Icelandic, Danish, Norwegian
      case L'ǽ':
      case L'œ': // oe ligature (French, Latin),
                 // but, alas, also seen used instead of æ
        return "ae";

      case L'ö': // Icelandic, Swedish, German
      case L'ø': // Danish, Norwegian
      case L'ǿ':
        return "oe";

      case L'å': // Danish, Norwegian
      case L'ǻ':
        return "aa";

        // Just ignore all other diacritics
      case L'á': case L'à': case L'â': case L'ã': case L'ă': case L'ā':
      case L'ą': case L'ạ': case L'ª': case L'ǎ': case L'ả': case L'ấ':
      case L'ẫ': case L'ậ': case L'ắ': case L'ẩ': case L'ầ':
        return "a";

      case L'ç': case L'ć': case L'č': case L'ĉ': case L'ℂ':
        return "c";

      case L'đ': case L'ď': case L'ḍ':
      case L'ð': // Transcribed d in Danish
        return "d";

        // NOTE(review): U+2113 (ℓ) is SCRIPT SMALL L; its mapping to 'e'
        // is kept unchanged here - confirm whether 'l' was intended.
      case L'é': case L'è': case L'ê': case L'ë': case L'ė': case L'ē':
      case L'ě': case L'ę': case L'ə': case L'ĕ': case L'ễ': case L'ế':
      case L'ệ': case L'ℓ':
        return "e";

      case L'ğ': case L'ĝ': case L'ģ': case L'ġ':
        return "g";

      case L'ĥ': case L'ћ': case L'ḥ': case L'ħ':
        return "h";

      case L'í': case L'ì': case L'î': case L'ĩ': case L'ï': case L'ı':
      case L'ī': case L'ї': case L'ĭ':
      case L'i': // Normal i - lowercase of İ
      case L'ǐ': case L'ị': case L'ɨ': case L'į':
        return "i";

      case L'ĳ': // U+0133, the single-character ij ligature
        return "ij";

      case L'ĵ':
        return "j";

      case L'ќ': case L'ķ':
        return "k";

      case L'ł': case L'ļ': case L'ľ':
        return "l";

      case L'ñ': case L'ń': case L'ň': case L'ņ': case L'ṇ': case L'ŋ':
      case L'ℕ':
        return "n";

      case L'ó': case L'ò': case L'ô': case L'õ': case L'ō': case L'ő':
      case L'ọ': case L'ǫ': case L'º': case L'ơ': case L'ồ': case L'ố':
      case L'ờ': case L'ổ': case L'ớ': case L'ỗ': case L'ŏ': case L'ǒ':
        return "o";

      case L'ℚ':
        return "q";

      case L'ř': case L'ℝ':
        return "r";

      case L'š': case L'ş': case L'ś': case L'ſ': case L'ŝ': case L'ș':
      case L'ṣ':
        return "s";

      case L'ß':
        return "ss";

      case L'ţ': case L'ť': case L'ṭ':
        return "t";

      case L'þ': // Transcribed th in Danish
        return "th";

      case L'ú': case L'ù': case L'û': case L'ũ': case L'ū': case L'ů':
      case L'ư': case L'ǔ': case L'ų': case L'ŭ': case L'џ': case L'ṳ':
      case L'ű': case L'ứ': case L'ữ': case L'ự': case L'ừ': case L'µ':
        return "u";

      case L'ŵ':
        return "w";

        // ü is deliberately grouped with y here
      case L'ý': case L'ü': case L'ŷ': case L'ÿ': case L'ỳ': case L'ỹ':
      case L'ў': case L'ẏ':
        return "y";

      case L'ż': case L'ž': case L'ź': case L'ℤ':
        return "z";

      default:
        return st_is_skipped_script (wch) ? "" : NULL;
     }
}

/*
 * Make a simplified title.
 *
 * A leading "Den_", "Det_", "The_", "Sir_" or "De_" is skipped.  ASCII
 * letters and digits are copied in lower case and all other ASCII bytes
 * are dropped.  Bytes >= 128 are decoded with mbrtowc(), i.e. according
 * to the LC_CTYPE locale currently in effect (the caller is presumably
 * expected to have selected a UTF-8 locale).  Decoded characters are
 * lowercased and transliterated to ASCII where a mapping is known;
 * characters from a number of non-Latin scripts are dropped; anything
 * else is reported on stdout and copied unchanged.
 *
 * title:     NUL-terminated title; NULL is treated as an empty title.
 * id:        only used in the diagnostic messages.
 * utf8error: set to true when an invalid or truncated multibyte sequence
 *            is met (never reset to false here); may be NULL.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  The result is truncated to at most 255 bytes, always on a
 * whole-character boundary.
 */
const char *make_simple_title (const char *title, int id, bool *utf8error)
{
   static char simple_title[256];
   char *to = simple_title;
   // Last position that may hold the terminating NUL
   const char *const limit = simple_title + sizeof simple_title - 1;

   if (title == NULL)
     {
        *to = '\0';
        return simple_title;
     }

   const char *from = title;

   // Pass articles etc.
   if (strncmp (title, "Den_", 4) == 0 ||   // Danish article
       strncmp (title, "Det_", 4) == 0 ||   // Danish article
       strncmp (title, "The_", 4) == 0 ||   // English article
       strncmp (title, "Sir_", 4) == 0)     // English title
     {
        from += 4;
     }
   else if (strncmp (title, "De_", 3) == 0) // Danish article
     {
        from += 3;
     }

   const char *const end = from + strlen (from);

   // Private decoding state: the state is unspecified after a failed
   // mbrtowc() call, so it is reset below instead of sharing the hidden
   // internal state used when NULL is passed.
   mbstate_t state;
   memset (&state, 0, sizeof state);

   while (*from)
     {
        char buf[MB_LEN_MAX];
        const char *piece;
        size_t piece_len;
        unsigned char ch = (unsigned char) *from;

        if (ch < 128)
          {
             // ASCII character
             ++ from;
             if (! isalnum (ch))
               continue;
             buf[0] = (char) tolower (ch);
             piece = buf;
             piece_len = 1;
          }
        else
          {
             // Multibyte character; never let mbrtowc look past the
             // end of the string.
             wchar_t wch;
             size_t retval = mbrtowc (&wch, from, (size_t) (end - from),
                                      &state);
             if (retval == (size_t) -1 || retval == (size_t) -2)
               {
                  // Invalid sequence, or one cut short by the end of
                  // the string: flag it and skip this byte.
                  if (utf8error != NULL)
                    *utf8error = true;
                  memset (&state, 0, sizeof state);
                  ++ from;
                  continue;
               }
             from += retval;

             if (! iswalnum ((wint_t) wch))
               {
                  // Superscript digits are kept as plain digits;
                  // everything else that is not alphanumeric is dropped.
                  if (wch == L'¹')
                    piece = "1";
                  else if (wch == L'²')
                    piece = "2";
                  else if (wch == L'³')
                    piece = "3";
                  else
                    continue;
                  piece_len = 1;
               }
             else
               {
                  wch = (wchar_t) towlower ((wint_t) wch);
                  piece = st_ascii_replacement (wch);
                  if (piece != NULL)
                    {
                       piece_len = strlen (piece);
                    }
                  else
                    {
                       // Not handled yet ...
                       mbstate_t out_state;
                       memset (&out_state, 0, sizeof out_state);
                       size_t len = wcrtomb (buf, wch, &out_state);
                       if (len == (size_t) -1)
                         {
                            printf ("Illegal character in '%s', id = %d, code = %d (%X)\n",
                                    title, id, (int) wch, (unsigned) wch);
                            // Nothing sensible to copy
                            continue;
                         }
                       // buf is not NUL-terminated, so bound the print
                       // with a precision.
                       printf ("Unhandled character in '%s' (%.*s), id = %d, code = %d (%X)\n",
                               title, (int) len, buf, id,
                               (int) wch, (unsigned) wch);

                       // Take everything not caught as is
                       piece = buf;
                       piece_len = len;
                    }
               }
          }

        // Stop at the first piece that does not fit: the result is
        // truncated rather than written past the end of the buffer.
        if (! st_append (&to, limit, piece, piece_len))
          break;
     }
   *to = '\0';
   return simple_title;
}