Unicode String Functions – Family Historian User Group

Important note: It is recommended that you use the utf8 library in Family Historian 5, 6 and 7 (combined with the compat53 library in Family Historian 5 and 6) to access a set of UTF-8 string handling functions that supersede the code below. See Lua References and Library Modules for details on how to install the libraries and find their documentation.

You should only use the code below if you cannot use the recommended libraries.

The standard Lua String Manipulation functions are designed to work with ANSI encoded text, and some do not handle Unicode UTF-8 encoded text correctly.

The following functions supplement the Lua string library, and adapt to the current string encoding in either ƒh V5 or ƒh V6.

Requires: None

Code

-- Report the string encoding in use --
-- Returns "ANSI" unless fhGetAppVersion() is above 5, in which case the answer
-- is delegated to fhGetStringEncoding().
function encoding()
	if not (fhGetAppVersion() > 5) then
		return "ANSI"
	end
	return fhGetStringEncoding()
end -- function encoding
 
local dicUpper = { }   -- lower case letter -> upper case letter
local dicLower = { }   -- upper case letter -> lower case letter
local dicCaseX = { }   -- either case letter -> "[Xx]" pattern class matching both
 
-- Seed the dictionaries with the 26 unaccented ASCII letters --
do
   local intCaseGap = string.byte("a") - string.byte("A")   -- distance from an upper case letter to its lower case twin
   for intCode = string.byte("A"), string.byte("Z") do
      local strUpper, strLower = string.char(intCode), string.char(intCode + intCaseGap)
      local strEither = "[" .. strUpper .. strLower .. "]"
      dicUpper[strLower], dicLower[strUpper] = strUpper, strLower
      dicCaseX[strUpper], dicCaseX[strLower] = strEither, strEither
   end
end
 
-- Character count of ANSI text (one byte per character); nil or false counts as empty --
function length(strTxt)
   if not strTxt then
      return 0
   end
   return string.len(strTxt)
end -- function length
 
-- Characters i to j of ANSI text, with the same index rules as string.sub; nil or false text is treated as empty --
function substring(strTxt,i,j)
   local strSource = strTxt
   if not strSource then
      strSource = ""
   end
   return string.sub(strSource, i, j)
end -- function substring
 
-- Build a case-insensitive pattern from ANSI text --
-- Every unaccented letter is swapped for its dicCaseX entry (a class such as "[Aa]");
-- all other characters pass through unchanged. Only the resulting string is returned.
function caseless(strTxt)
   local strSource = tostring(strTxt or "")
   local strPattern = string.gsub(strSource, "[A-Za-z]", dicCaseX)
   return strPattern
end -- function caseless
 
if encoding() == "UTF-8" then
 
   -- Supply character length of UTF-8 text --
   function length(strTxt)
      isFlag = fhIsConversionLossFlagSet()
      strTxt = fhConvertUTF8toANSI(strTxt or "")
      fhSetConversionLossFlag(isFlag)
      return string.len(strTxt)
   end -- function length
 
   -- Supply character substring of UTF-8 text --
   function substring(strTxt,i,j)
      local strSub = ""
      j = j or -1
      if j < 0 then j = j + length(strTxt) + 1 end
      if i < 0 then i = i + length(strTxt) + 1 end
      for strChr in string.gmatch(strTxt or "","([%z\1-\127\194-\244][\128-\191]*)") do
         if j <= 0 then break end
         j = j - 1
         i = i - 1
         if i <= 0 then strSub = strSub..strChr end
      end
      return strSub
   end -- function substring
 
   -- Translate lower case to upper case UTF-8 letters --
   function upper(strTxt)
      strTxt = tostring(strTxt or ""):gsub("([a-z\194-\244][\128-\191]*)",dicUpper)
      return strTxt
   end -- function upper
 
   -- Translate upper case to lower case UTF-8 letters --
   function lower(strTxt)
      strTxt = tostring(strTxt or ""):gsub("([A-Z\194-\244][\128-\191]*)",dicLower)
      return strTxt
   end -- function lower
 
   -- Translate upper/lower case UTF-8 letters to pattern that matches both --
   function caseless(strTxt)
      strTxt = tostring(strTxt or ""):gsub("([A-Za-z\194-\244][\128-\191]*)",dicCaseX)
      return strTxt
   end -- function caseless
 
   -- Following tables use ASCII numeric coding to be immune from ANSI/UTF-8 encoding --
 
   local arrPairs =   -- Upper & Lower case groups of UTF-8 letters with same prefix --
   {--   { Prefix, Beg , End , Inc, Offset Upper > Lower },   -- These include all ANSI letters and more
      { "\195", 0x80, 0x96,  1 , 32 },   -- 195=0xC3 À U+00C0 to Ö U+00D6 and à U+00E0 to ö U+00F6
      { "\195", 0x98, 0x9E,  1 , 32 },   -- 195=0xC3 Ø U+00D8 to Þ U+00DE and ø U+00F8 to þ U+00FE
      { "\196", 0x80, 0xB6,  2 ,  1 },   -- 196=0xC4 Ā U+0100 to ķ U+0137 in pairs
      { "\196", 0xB9, 0xBD,  2 ,  1 },   -- 196=0xC4 Ĺ U+0139 to ľ U+013E in pairs
      { "\197", 0x81, 0x87,  2 ,  1 },   -- 197=0xC5 Ł U+0141 to ň U+0148 in pairs
      { "\197", 0x8A, 0xB6,  2 ,  1 },   -- 197=0xC5 Ŋ U+014A to ŷ U+0177 in pairs
      { "\197", 0xB9, 0xBD,  2 ,  1 },   -- 197=0xC5 Ź U+0179 to ž U+017E in pairs
      { "\198", 0x82, 0x84,  2 ,  1 },   -- 198=0xC6 Ƃ  U+0182 to ƅ  U+0185 in pairs
      -- Add more Unicode groups here as usage increases --
   }
   local dicPairs =   -- Upper v Lower case UTF-8 letters that don't fit groups above --
   {   [string.char(0xC4,0xBF)] = string.char(0xC5,0x80),   -- Ŀ U+013F and ŀ U+0140
      [string.char(0xC5,0xB8)] = string.char(0xC3,0xBF),    -- Ÿ U+0178 and ÿ U+00FF
   }
 
   -- Populate the UTF-8 letter translation dictionaries --
   for intGroup, tblGroup in ipairs ( arrPairs ) do   -- UTF-8 accented letter groups
      strPrefix = tblGroup[1]
      for intUpper = tblGroup[2], tblGroup[3], tblGroup[4] do
         local strUpper = string.char(intUpper)
         local strLower = string.char(intUpper + tblGroup[5])
         local strCaseX = strPrefix.."["..strUpper..strLower.."]"
         strUpper = strPrefix..strUpper
         strLower = strPrefix..strLower
         dicUpper[strLower] = strUpper
         dicLower[strUpper] = strLower
         dicCaseX[strLower] = strCaseX
         dicCaseX[strUpper] = strCaseX
      end
   end
   for strUpper, strLower in pairs ( dicPairs ) do   -- UTF-8 accented letters where upper & lower have different prefix
      dicUpper[strLower] = strUpper
      dicLower[strUpper] = strLower
      local strCaseX = ""
      for intByte = 1, #strUpper do         -- Matches more than just the two letters, but can't do any better
         strCaseX = strCaseX.."["..strUpper:sub(intByte,intByte)..strLower:sub(intByte,intByte).."]"
      end
      dicCaseX[strLower] = strCaseX
      dicCaseX[strUpper] = strCaseX
   end
 
end

Usage

The text string may be ANSI encoded where each character is one byte, or it may be UTF-8 encoded where Ø and ø are both two bytes.

local strText = "AaØøZz"
print( string.upper(strText) )          -->> AAØØZZ
print( string.lower(strText) )          -->> aaøøzz
print( string.len(strText) )            -->> 6 or 8   length in bytes
print( string.length(strText) )         -->> 6        length in characters
print( string.sub(strText,3,4) )        -->> Øø or Ø  substring in bytes
print( string.substring(strText,3,4) )  -->> Øø       substring in characters

Code

Usage

Related Content