Important note: It is recommended that you use the utf8 library in Family Historian 5, 6 and 7 (combined with the compat53 library in Family Historian 5 and 6) to access a set of UTF-8 string handling functions that supersede the code below. See Lua References and Library Modules for details on how to install the libraries and find their documentation.
You should only use the code below if you cannot use the recommended libraries.
The standard Lua String Manipulation functions are designed to work with ANSI encoded text, and some do not handle Unicode UTF-8 encoded text correctly.
The following functions supplement the Lua string library, and adapt to the current string encoding in either ƒh V5 or ƒh V6.
Requires: None
Code
-
-- String helpers that adapt to the current text encoding (ANSI or UTF-8).
--
-- Globals defined: encoding, length, substring, caseless, and (only when the
-- encoding is "UTF-8") upper and lower.
-- NOTE(review): the functions are defined as plain globals here, while the
-- usage example on this page calls them as string.length() etc.; how they are
-- attached to the string table is not shown in this snippet -- confirm.

-- Supply current file encoding format --
-- Returns the result of fhGetStringEncoding() when fhGetAppVersion() > 5,
-- otherwise the fixed string "ANSI".
function encoding()
  if fhGetAppVersion() > 5 then
    return fhGetStringEncoding()
  end
  return "ANSI"
end -- function encoding

-- Translation dictionaries, each keyed by one letter (one or more bytes) --
local dicUpper = { }  -- lower case letter  -> upper case letter
local dicLower = { }  -- upper case letter  -> lower case letter
local dicCaseX = { }  -- letter of any case -> pattern matching both cases

-- ASCII unaccented letter translations for Upper, Lower, and Case Insensitive
for intUpper = string.byte("A"), string.byte("Z") do
  local strUpper = string.char(intUpper)
  local strLower = string.char(intUpper - string.byte("A") + string.byte("a"))
  dicUpper[strLower] = strUpper
  dicLower[strUpper] = strLower
  local strCaseX = "["..strUpper..strLower.."]"
  dicCaseX[strLower] = strCaseX
  dicCaseX[strUpper] = strCaseX
end

-- Supply character length of ANSI text --
-- A nil argument is treated as the empty string.
function length(strTxt)
  return string.len(strTxt or "")
end -- function length

-- Supply character substring of ANSI text --
-- Same index rules as string.sub; a nil text is treated as the empty string.
function substring(strTxt,i,j)
  return string.sub(strTxt or "",i,j)
end -- function substring

-- Translate upper/lower case ANSI letters to pattern that matches both --
-- Only the unaccented letters A-Z and a-z are translated (e.g. "a" -> "[Aa]");
-- every other character is passed through unchanged.
function caseless(strTxt)
  strTxt = tostring(strTxt or ""):gsub("[A-Za-z]",dicCaseX)
  return strTxt  -- assigned first so gsub's second result (the count) is dropped
end -- function caseless

-- The UTF-8 variants below replace the ANSI ones when the encoding is UTF-8 --
if encoding() == "UTF-8" then

  -- Supply character length of UTF-8 text --
  -- The text is converted to ANSI and the byte length of the result returned.
  -- The conversion loss flag is read beforehand and written back afterwards,
  -- so this call leaves it as it found it.
  function length(strTxt)
    local isFlag = fhIsConversionLossFlagSet()  -- local: must not leak a global
    strTxt = fhConvertUTF8toANSI(strTxt or "")
    fhSetConversionLossFlag(isFlag)
    return string.len(strTxt)
  end -- function length

  -- Supply character substring of UTF-8 text --
  -- i and j are character positions (not byte positions); negative values
  -- count back from the end of the text, and j defaults to -1 (last character).
  function substring(strTxt,i,j)
    local arrSub = { }  -- collected characters, joined once at the end
    j = j or -1
    if j < 0 then j = j + length(strTxt) + 1 end
    if i < 0 then i = i + length(strTxt) + 1 end
    -- Each match is one lead byte plus any continuation bytes that follow it
    for strChr in string.gmatch(strTxt or "","([%z\1-\127\194-\244][\128-\191]*)") do
      if j <= 0 then break end  -- already past the last wanted character
      j = j - 1
      i = i - 1
      if i <= 0 then  -- reached the first wanted character
        arrSub[#arrSub + 1] = strChr
      end
    end
    return table.concat(arrSub)
  end -- function substring

  -- Translate lower case to upper case UTF-8 letters --
  -- Letters with no entry in dicUpper are left unchanged by gsub.
  function upper(strTxt)
    strTxt = tostring(strTxt or ""):gsub("([a-z\194-\244][\128-\191]*)",dicUpper)
    return strTxt
  end -- function upper

  -- Translate upper case to lower case UTF-8 letters --
  -- Letters with no entry in dicLower are left unchanged by gsub.
  function lower(strTxt)
    strTxt = tostring(strTxt or ""):gsub("([A-Z\194-\244][\128-\191]*)",dicLower)
    return strTxt
  end -- function lower

  -- Translate upper/lower case UTF-8 letters to pattern that matches both --
  -- Letters with no entry in dicCaseX are left unchanged by gsub.
  function caseless(strTxt)
    strTxt = tostring(strTxt or ""):gsub("([A-Za-z\194-\244][\128-\191]*)",dicCaseX)
    return strTxt
  end -- function caseless

  -- Following tables use ASCII numeric coding to be immune from ANSI/UTF-8 encoding --

  local arrPairs =  -- Upper & Lower case groups of UTF-8 letters with same prefix --
  {-- { Prefix, Beg , End , Inc, Offset Upper > Lower },  -- These include all ANSI letters and more
    { "\195", 0x80, 0x96, 1 , 32 },  -- 195=0xC3 À U+00C0 to Ö U+00D6 and à U+00E0 to ö U+00F6
    { "\195", 0x98, 0x9E, 1 , 32 },  -- 195=0xC3 Ø U+00D8 to Þ U+00DE and ø U+00F8 to þ U+00FE
    { "\196", 0x80, 0xB6, 2 , 1 },   -- 196=0xC4 Ā U+0100 to ķ U+0137 in pairs
    { "\196", 0xB9, 0xBD, 2 , 1 },   -- 196=0xC4 Ĺ U+0139 to ľ U+013E in pairs
    { "\197", 0x81, 0x87, 2 , 1 },   -- 197=0xC5 Ł U+0141 to ň U+0148 in pairs
    { "\197", 0x8A, 0xB6, 2 , 1 },   -- 197=0xC5 Ŋ U+014A to ŷ U+0177 in pairs
    { "\197", 0xB9, 0xBD, 2 , 1 },   -- 197=0xC5 Ź U+0179 to ž U+017E in pairs
    { "\198", 0x82, 0x84, 2 , 1 },   -- 198=0xC6 Ƃ U+0182 to ƅ U+0185 in pairs
    -- Add more Unicode groups here as usage increases --
  }

  local dicPairs =  -- Upper v Lower case UTF-8 letters that don't fit groups above --
  {
    [string.char(0xC4,0xBF)] = string.char(0xC5,0x80),  -- Ŀ U+013F and ŀ U+0140
    [string.char(0xC5,0xB8)] = string.char(0xC3,0xBF),  -- Ÿ U+0178 and ÿ U+00FF
  }

  -- Populate the UTF-8 letter translation dictionaries --
  for _, tblGroup in ipairs ( arrPairs ) do  -- UTF-8 accented letter groups
    local strPrefix = tblGroup[1]  -- local: must not leak a global
    for intUpper = tblGroup[2], tblGroup[3], tblGroup[4] do
      local strUpper = string.char(intUpper)
      local strLower = string.char(intUpper + tblGroup[5])
      -- Both cases share the prefix byte, so only the last byte needs a set
      local strCaseX = strPrefix.."["..strUpper..strLower.."]"
      strUpper = strPrefix..strUpper
      strLower = strPrefix..strLower
      dicUpper[strLower] = strUpper
      dicLower[strUpper] = strLower
      dicCaseX[strLower] = strCaseX
      dicCaseX[strUpper] = strCaseX
    end
  end

  for strUpper, strLower in pairs ( dicPairs ) do  -- UTF-8 accented letters where upper & lower have different prefix
    dicUpper[strLower] = strUpper
    dicLower[strUpper] = strLower
    local strCaseX = ""
    for intByte = 1, #strUpper do  -- Matches more than just the two letters, but can't do any better
      strCaseX = strCaseX.."["..strUpper:sub(intByte,intByte)..strLower:sub(intByte,intByte).."]"
    end
    dicCaseX[strLower] = strCaseX
    dicCaseX[strUpper] = strCaseX
  end

end
Usage
The text string may be ANSI encoded, where each character is one byte, or UTF-8 encoded, where Ø and ø are each two bytes.
-
-- Usage example: compares the byte-based standard functions with the
-- character-based helpers. Where two results are shown ("x or y"), the first
-- is for ANSI encoded text and the second for UTF-8 encoded text.
-- NOTE(review): string.length and string.substring are not standard Lua
-- functions; this example presumes the helpers are reachable through the
-- string table -- confirm how they are installed.
local strText = "AaØøZz"
print( string.upper(strText) )          -->> AAØØZZ
print( string.lower(strText) )          -->> aaøøzz
print( string.len(strText) )            -->> 6 or 8    length in bytes
print( string.length(strText) )         -->> 6         length in characters
print( string.sub(strText,3,4) )        -->> Øø or Ø   substring in bytes
print( string.substring(strText,3,4) )  -->> Øø        substring in characters