/* This program contains routines to handle Unicode, especially functions that
   require downloading Unicode tables to work correctly.

   Frink already has many powerful Unicode-aware string functions, which can
   be seen in:

   https://frinklang.org/#CorrectStringParsing

   Many other Unicode tests on single characters can be performed through the
   Java class java.lang.Character.  For example:

   callJava["java.lang.Character", "isWhitespace", char["\t"]]

   which returns true.

   The java.lang.Character class is disastrous and poorly designed, though,
   with methods like getType not returning consistent bitmapped values.
*/
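
/* An illustration of the getType complaint above (a hedged sketch; the exact
   integer comes from Java's Character class, not from this file):  getType
   returns small integer category constants such as Character.UPPERCASE_LETTER
   (which is 1), rather than bitmapped flags that could be combined or tested
   with bitwise operations.

   callJava["java.lang.Character", "getType", char["A"]]   // evaluates to 1
*/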
class Unicode
{
   /** A dictionary mapping from codepoint (as integer) -> codepoint name. */
   class var codepointNames = undef

   /** A dictionary mapping from char -> chars of confusable characters. */
   class var confusablesDict = undef

   /** A flag indicating if the "confusables" dictionary has been loaded. */
   class var confusablesLoaded = false

   /** The name of the Java class that knows about characters. */
   class var CHARCLASS = "java.lang.Character"
   /** This returns the Unicode codepoint name for a codepoint (specified as an
       integer, a string containing one or more codepoints, or a list of
       either). */
   class getCodepointName[i] :=
   {
      if codepointNames == undef
         loadCodepointNames[]

      if isInteger[i]
         return codepointNames@i
      else
         if isString[i]
         {
            result = new array
            for c = chars[i]
               result.push[codepointNames@c]
            return result
         } else
         {
            result = new array
            for c = i
               result.push[getCodepointName[c]]
            return result
         }
   }
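
   /* Example usage (a hedged sketch; actual names come from the Java runtime's
      Unicode tables via charName):

      Unicode.getCodepointName[65]      // "LATIN CAPITAL LETTER A"
      Unicode.getCodepointName["¿!"]    // ["INVERTED QUESTION MARK", "EXCLAMATION MARK"]
   */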
   /** This searches the codepoint names for values that match a specific
       regular expression. */
   class searchNames[pattern] :=
   {
      if codepointNames == undef
         loadCodepointNames[]

      retval = new array
      for [codepoint, name] = codepointNames
         if name =~ pattern
            retval.push[[codepoint, char[codepoint], name]]

      // Sort by codepoint
      return sort[retval, byColumn[0]]
   }
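
   /* Example usage (a hedged sketch; exact results depend on the runtime's
      Unicode version):  each entry is [codepoint, character, name], sorted by
      codepoint.

      Unicode.searchNames[%r/RECYCLING SYMBOL/]
      // e.g. [[9842, "♲", "UNIVERSAL RECYCLING SYMBOL"], ... ]
   */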
   /** This is a human-readable name search.  It takes a regular expression
       and returns a printable listing of the names that match that pattern. */
   class prettySearchNames[pattern] :=
   {
      ret = ""
      for [dec, c, name] = Unicode.searchNames[pattern]
         ret = ret + toASCIIHigh[char[dec]] + "\t" + c + "\t" + name + "\n"

      return ret
   }
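
   /* Example usage (a hedged sketch):  prints an ASCII-escaped form of each
      matching character, the raw character, and its name, tab-separated, one
      match per line.

      print[Unicode.prettySearchNames[%r/BICYCLE/]]
   */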
   /** This is a private method that loads the dictionary of codepoint names. */
   class loadCodepointNames[] :=
   {
      codepointNames = new dict
      min = staticJava[CHARCLASS, "MIN_CODE_POINT"]
      // max = staticJava[CHARCLASS, "MAX_CODE_POINT"]
      max = 0x1F9FF
      for i = min to max
         if callJava[CHARCLASS, "isDefined", i]
            codepointNames@i = charName[i]
   }
   /** deconfuse[string]:  This follows the procedure in Unicode Technical
       Standard #39 for deconfusing similar characters in a string:
       http://www.unicode.org/reports/tr39/
       Specifically, section 4, "Confusable Detection".

       It uses the "confusables" table available at:
       http://www.unicode.org/Public/security/latest/confusables.txt
       to perform the deconfusing.

       You generally don't want to call this on a single string, but rather to
       compare
          deconfuse[x] == deconfuse[y]
       to see if two separate strings are confusable.  Or, if
          deconfuse[x] != x
       then you might have reason to believe that the string x had confusable
       characters in it, and you might want to treat it with stronger security.

       This is *still* a rather weak notion of confusability, as combining
       characters and accents are not folded together.  You may want to first
       use normalizeUnicode[str] to perform some of this normalization.

       You might want to use something stronger, like folding Unicode to ASCII:
       https://github.com/ericxtang/sunspot/blob/deafdd55f2a9534cc96471958ea1c206430832e7/sunspot/solr/solr/conf/mapping-FoldToASCII.txt
   */
   class deconfuse[str] :=
   {
      nfd = normalizeUnicode[str, "NFD"]
      loadConfusables[]

      result = new array
      for c = charList[nfd]
      {
         if confusablesDict.containsKey[c]
            result.push[confusablesDict@c]
         else
            result.push[c]
      }

      return normalizeUnicode[join["", result], "NFD"]
   }
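
   /* Example usage (a hedged sketch; the exact mappings depend on the
      downloaded confusables.txt, which maps Cyrillic "а" to Latin "a"):

      Unicode.deconfuse["pаypаl"] == Unicode.deconfuse["paypal"]
      // true:  the first string uses CYRILLIC SMALL LETTER A
   */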
   /** Internal function to load the Unicode "confusables" file. */
   class loadConfusables[] :=
   {
      if confusablesLoaded
         return

      // This is a dictionary from source char to target string.
      confusablesDict = new dict

      // TODO: Cache this file somewhere
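
      /* A hedged sketch of the file's data-line format (fields are separated
         by ";" and whitespace; everything after "#" is a comment):

         0251 ; 0061 ; MA # ( ɑ → a ) LATIN SMALL LETTER ALPHA → LATIN SMALL LETTER A

         The regular expression below captures the source codepoint and the
         space-separated target codepoints from the first two fields. */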
      for line = lines["http://www.unicode.org/Public/security/latest/confusables.txt"]
      {
         if line =~ %r/^\s*#/
            next

         if [source, target] = line =~ %r/([A-F0-9]{4,6})\s*;\s*([\sA-F0-9]+)/
         {
            target = trim[target]
            sourceStr = char[parseInt[source, 16]]
            targetStr = char[map[{|x| parseInt[x,16]}, split[" ", target]]]
            confusablesDict@sourceStr = targetStr
            // println["$sourceStr\t$targetStr"]
         }
      }

      confusablesLoaded = true
   }
}
/*
dumpChars[x] := println["$x\t" + uc[hex[char[x]]]]
original = "Inglês"
original = "\u2487" // This is a Unicode character indicating parenthesized 20
dumpChars[original]
deconfuse = Unicode.deconfuse[original]
dumpChars[deconfuse]
dumpChars[Unicode.deconfuse["(2O)"]] // This is a letter capital O, not a zero
*/