Size: 26917
Comment:
|
← Revision 48 as of 2025-04-19 09:07:42 ⇥
Size: 28765
Comment: various fixes and improvements
|
Deletions are marked like this. | Additions are marked like this. |
Line 15: | Line 15: |
`printf` in bash 4.2 and higher, and in ksh93, supports Unicode code points as well: {{{#!highlight bash # bash 4.2, ksh93 |
`printf` in bash 4.2 and higher, ksh93, zsh and the GNU standalone `printf` utility (the first to introduce it) supports Unicode code points as well: {{{#!highlight bash # bash 4.2, zsh, ksh93 |
Line 21: | Line 21: |
(for codepoints above U+FFFF, use `\Uxxxxxxxx` instead). In most `printf` implementations that support that extension including bash's, that prints the corresponding character using the correct byte value according to the character encoding specified in the locale (see output of `locale charmap`), but note that ksh93's `printf` outputs it encoded in UTF-8 whether or not that's the encoding used in the locale. For example: {{{ bash-5.2$ export LANG=en_US.UTF-8 bash-5.2$ locale charmap UTF-8 bash-5.2$ printf '\u20AC' | od -vtx1 0000000 e2 82 ac 0000003 bash-5.2$ export LANG=fr_FR.iso885915 bash-5.2$ locale charmap ISO-8859-15 bash-5.2$ printf '\u20AC' | od -vtx1 0000000 a4 0000001 }}} U+20AC (€) is encoded as the 0xA4 single byte in ISO-8859-15 and on three bytes (0xe2 0x82 0xac) in UTF-8. |
|
Line 31: | Line 55: |
Beware that for `$'\uXXXX'` and `$'\UXXXXXXXX'` in bash (contrary to zsh which first added those), it's the locale that was in force at the time the code was *read*, not *run* that is used to expand the character to the corresponding byte values. |
|
Line 44: | Line 70: |
Technically, ASCII only specifies characters for bytes 0 to 127. For values 128 to 255, that will still output the corresponding ''byte'' value, which won't correspond to a character in ASCII but may in the character encoding used in your locale. |
|
Line 59: | Line 87: |
LC_CTYPE=C printf %d "'$1" | LC_ALL= LC_CTYPE=C printf %d "'$1" |
Line 66: | Line 94: |
LC_CTYPE=C printf %x "'$1" | LC_ALL= LC_CTYPE=C printf %x "'$1" |
Line 70: | Line 98: |
# non-POSIX | |
Line 79: | Line 108: |
The {{{ord}}} function above is quite tricky. . ''Tricky? Rather, it's using a feature that I can't find documented anywhere -- putting a single quote in front of a character. Neat effect, but how on '''earth''' did you find out about it? Source diving? -- GreyCat'' . ''It validates The Single Unix Specification: "If the leading character is a single-quote or double-quote, the value shall be the numeric value in the underlying codeset of the character following the single-quote or double-quote." (see [[http://www.opengroup.org/onlinepubs/009695399/utilities/printf.html|printf()]] to know more) -- mjf'' |
The {{{ord}}} function above relies on the fact that `printf` for numeric values handles `'x'` values like the C language does where `'x'` is the value of character `x`. In ksh on which the POSIX specification of `sh` is based, you can use any arithmetic expression in there, and in its arithmetic expressions, `'x'` is handled like in `C`, and ksh is generally quite lax about unclosed quotes. ``` ksh$ printf '%d\n' " 'z' - 'a' + 1 " 26 ksh$ echo "$(( 'z' - 'a' + 1 ))" 26 ksh$ echo "$(( 'z ))" 122 ``` POSIX did not specify that for arithmetic expressions, but specified `'x'` for printf numeric arguments, not requiring the closing `'` and extending it to `"x"`. ''"If the leading character is a single-quote or double-quote, the value shall be the numeric value in the underlying codeset of the character following the single-quote or double-quote." (see [[http://www.opengroup.org/onlinepubs/009695399/utilities/printf.html|printf()]] to know more) -- mjf'' |
Line 187: | Line 229: |
printf -v "${1?Missing Dest Variable}" "${3:-%d}" "'${2?Missing Char}" | printf -v "${1?Missing Dest Variable}" -- "${3:-%d}" "'${2?Missing Char}" |
Line 202: | Line 244: |
printf "${2:-%d}" "'${1?Missing Char}" | printf -- "${2:-%d}" "'${1?Missing Char}" |
Line 251: | Line 293: |
printf -v val "\\%03o" "${2}" | printf -v val "\\%o" "${2}" |
Line 253: | Line 295: |
printf -v val "\\%03o" \ $(( (${2}>> 6) |0xc0 )) \ $(( ( ${2} &0x3f)|0x80 )) |
printf -v val "\\%o" \ "$(( (${2}>> 6) |0xc0 ))" \ "$(( ( ${2} &0x3f)|0x80 ))" |
Line 257: | Line 299: |
printf -v val "\\%03o" \ $(( ( ${2}>>12) |0xe0 )) \ $(( ((${2}>> 6)&0x3f)|0x80 )) \ $(( ( ${2} &0x3f)|0x80 )) |
printf -v val "\\%o" \ "$(( ( ${2}>>12) |0xe0 ))" \ "$(( ((${2}>> 6)&0x3f)|0x80 ))" \ "$(( ( ${2} &0x3f)|0x80 ))" |
Line 262: | Line 304: |
printf -v val "\\%03o" \ $(( ( ${2}>>18) |0xf0 )) \ $(( ((${2}>>12)&0x3f)|0x80 )) \ $(( ((${2}>> 6)&0x3f)|0x80 )) \ $(( ( ${2} &0x3f)|0x80 )) |
printf -v val "\\%o" \ "$(( ( ${2}>>18) |0xf0 ))" \ "$(( ((${2}>>12)&0x3f)|0x80 ))" \ "$(( ((${2}>> 6)&0x3f)|0x80 ))" \ "$(( ( ${2} &0x3f)|0x80 ))" |
Line 268: | Line 310: |
printf -v val "\\%03o" \ $(( ( ${2}>>24) |0xf8 )) \ $(( ((${2}>>18)&0x3f)|0x80 )) \ $(( ((${2}>>12)&0x3f)|0x80 )) \ $(( ((${2}>> 6)&0x3f)|0x80 )) \ $(( ( ${2} &0x3f)|0x80 )) |
printf -v val "\\%o" \ "$(( ( ${2}>>24) |0xf8 ))" \ "$(( ((${2}>>18)&0x3f)|0x80 ))" \ "$(( ((${2}>>12)&0x3f)|0x80 ))" \ "$(( ((${2}>> 6)&0x3f)|0x80 ))" \ "$(( ( ${2} &0x3f)|0x80 ))" |
Line 275: | Line 317: |
printf -v val "\\%03o" \ $(( ( ${2}>>30) |0xfc )) \ $(( ((${2}>>24)&0x3f)|0x80 )) \ $(( ((${2}>>18)&0x3f)|0x80 )) \ $(( ((${2}>>12)&0x3f)|0x80 )) \ $(( ((${2}>> 6)&0x3f)|0x80 )) \ $(( ( ${2} &0x3f)|0x80 )) |
printf -v val "\\%o" \ "$(( ( ${2}>>30) |0xfc ))" \ "$(( ((${2}>>24)&0x3f)|0x80 ))" \ "$(( ((${2}>>18)&0x3f)|0x80 ))" \ "$(( ((${2}>>12)&0x3f)|0x80 ))" \ "$(( ((${2}>> 6)&0x3f)|0x80 ))" \ "$(( ( ${2} &0x3f)|0x80 ))" |
Line 296: | Line 338: |
printf -v val "\\%03o\%03o" $(( (${2}>>6)|0xc0 )) $(( (${2}&0x3f)|0x80 )) | printf -v val "\\%o\\%o" "$(( (${2}>>6)|0xc0 ))" "$(( (${2}&0x3f)|0x80 ))" |
Line 300: | Line 342: |
printf -v ${1?Missing Dest Variable} ${val} | printf -v "${1?Missing Dest Variable}" "${val}" |
Line 310: | Line 352: |
printf -v val '\\x%02x' "${2}" printf -v ${1?Missing Dest Variable} ${val} |
printf -v val '\\%o' "${2}" printf -v ${1?Missing Dest Variable} "${val}" |
Line 314: | Line 356: |
if [ "${LC_CTYPE:-${LC_ALL:-}}" = "C" ]; then | if [ "${LC_ALL:-${LC_CTYPE:-}}" = "C" ]; then |
Line 323: | Line 365: |
chr "${1}" "${2#${2%%[!0]*}}" | chr "${1}" "${2#"${2%%[!0]*}"}" |
Line 329: | Line 371: |
chr "${1}" "0x${2#0x}" | chr "${1}" "0x${2#0[xX]}" |
Line 339: | Line 381: |
printf -v val '\\%03o\\%03o' $(( (${1}>>6)|0xc0 )) $(( (${1}&0x3f)|0x80 )) | printf -v val '\\%o\\%o' "$(( (${1}>>6)|0xc0 ))" "$(( (${1}&0x3f)|0x80 ))" |
Line 353: | Line 395: |
printf -v val '\\x%x' "${1}" | printf -v val '\\%o' "${1}" |
Line 357: | Line 399: |
if [ "${LC_CTYPE:-${LC_ALL:-}}" = "C" ]; then | if [ "${LC_ALL:-${LC_CTYPE:-}}" = "C" ]; then |
Line 366: | Line 408: |
chr_echo "${1#${1%%[!0]*}}" | chr_echo "${1#"${1%%[!0]*}"}" |
Line 372: | Line 414: |
chr_echo "0x${1#0x}" | chr_echo "0x${1#0[xX]}" |
Line 469: | Line 511: |
*) printf ':( Unexpected Input 0x%02x %q "%s"\n' "${asciiValue}" "${REPLY}" "${REPLY//[[:cntrl:]]}" ;; | *) printf>&2 ':( Unexpected Input 0x%02x %q "%s"\n' "${asciiValue}" "${REPLY}" "${REPLY//[[:cntrl:]]}" ;; |
Line 492: | Line 534: |
eval ${1}'=$(( ${#idx} +1 ))' | eval "${1}"'=$(( ${#idx} +1 ))' |
Line 498: | Line 540: |
#printf "EAsciiLookup2=(\n %s\n)" "$(for (( x=0x1; x<0x100 ; x++)); do printf '%-18s' "$(printf '[_%q]="0x%02x"' "$(printf "%b" "$(printf '\\x%02x' "$x")")" $x )" ; [ "$(($x%6))" != "0" ] || echo -en "\n " ; done)" | #printf "EAsciiLookup2=(\n %s\n)" "$(for (( x=0x1; x<0x100 ; x++)); do printf '%-18s' "$(printf '[_%q]="0x%02x"' "$(printf "%b" "$(printf '\\x%02x' "$x")")" $x )" ; [ "$(($x%6))" != "0" ] || printf "\n " ; done)" |
How do I convert an ASCII character to its decimal (or hexadecimal) value and back? How do I do URL encoding or URL decoding?
If you have a known octal or hexadecimal value (at script-writing time), you can just use printf:
In locales where the character encoding is a superset of ASCII, this prints the literal ' character (47 is the octal ASCII value of the apostrophe character) and a newline. The hexadecimal version can also be used with a few printf implementations including the bash builtin, but is not standard/POSIX.
printf in bash 4.2 and higher, ksh93, zsh and the GNU standalone printf utility (the first to introduce it) supports Unicode code points as well:
(for codepoints above U+FFFF, use \Uxxxxxxxx instead).
In most printf implementations that support that extension including bash's, that prints the corresponding character using the correct byte value according to the character encoding specified in the locale (see output of locale charmap), but note that ksh93's printf outputs it encoded in UTF-8 whether or not that's the encoding used in the locale.
For example:
bash-5.2$ export LANG=en_US.UTF-8
bash-5.2$ locale charmap
UTF-8
bash-5.2$ printf '\u20AC' | od -vtx1
0000000 e2 82 ac
0000003
bash-5.2$ export LANG=fr_FR.iso885915
bash-5.2$ locale charmap
ISO-8859-15
bash-5.2$ printf '\u20AC' | od -vtx1
0000000 a4
0000001
U+20AC (€) is encoded as the single byte 0xA4 in ISO-8859-15 (as the od output above shows) and on three bytes (0xe2 0x82 0xac) in UTF-8.
Another approach: bash's $'...' quoting can be used to expand to the desired characters, either in a variable assignment, or directly as a command argument:
Beware that for $'\uXXXX' and $'\UXXXXXXXX' in bash (contrary to zsh which first added those), it's the locale that was in force at the time the code was *read*, not *run* that is used to expand the character to the corresponding byte values.
If you need to convert characters (or numeric ASCII values) that are not known in advance (i.e., in variables), you can use something a little more complicated. Note: These functions only work for single-byte character encodings.
Technically, ASCII only specifies characters for bytes 0 to 127. For values 128 to 255, that will still output the corresponding byte value, which won't correspond to a character in ASCII but may in the character encoding used in your locale.
It is even better to pass the value back through a variable instead of capturing command output: that is faster because it avoids the subshell.
# ord CHAR
#   Prints the decimal value of CHAR on stdout (no trailing newline).
ord() {
  # POSIX
  # A leading quote makes printf yield the numeric value of the character
  # that follows it. LC_ALL is emptied so the LC_CTYPE=C override applies.
  LC_ALL= LC_CTYPE=C printf %d "'$1"
}
5
6 # hex() - converts ASCII character to a hexadecimal value
7 # unhex() - converts a hexadecimal value to an ASCII character
8
# hex CHAR
#   Prints the hexadecimal (lowercase, no prefix) value of CHAR on stdout.
hex() {
  # Same leading-quote printf feature as ord, formatted with %x.
  LC_ALL= LC_CTYPE=C printf %x "'$1"
}
12
# unhex HEX
#   Prints the character whose byte value is the hexadecimal number HEX.
unhex() {
  # non-POSIX: \xHH escapes are an extension.
  # The argument is expanded through %b rather than spliced into the format
  # string, so a stray '%' in the input is not taken as a conversion.
  printf '%b' "\\x$1"
}
17
18 # examples:
19
20 chr "$(ord A)" # -> A
21 ord "$(chr 65)" # -> 65
The ord function above relies on the fact that printf for numeric values handles 'x' values like the C language does where 'x' is the value of character x.
In ksh, on which the POSIX specification of sh is based, you can use any arithmetic expression in there, and in its arithmetic expressions 'x' is handled like in C; ksh is also generally quite lax about unclosed quotes.
ksh$ printf '%d\n' " 'z' - 'a' + 1 "
26
ksh$ echo "$(( 'z' - 'a' + 1 ))"
26
ksh$ echo "$(( 'z ))"
122
POSIX did not specify that for arithmetic expressions, but specified 'x' for printf numeric arguments, not requiring the closing ' and extending it to "x".
"If the leading character is a single-quote or double-quote, the value shall be the numeric value in the underlying codeset of the character following the single-quote or double-quote." (see printf() to know more) -- mjf
URL encoding and URL decoding
Note that URL encoding is defined only at the byte (octet) level. A URL-encoding of a multibyte (e.g. UTF-8) character is done by simply encoding each byte individually, then concatenating everything.
Also note that the urldecode function below performs no error checking; getting it to yield a sensible error message when you feed it malformed input is left as an exercise for the reader.
# Alternative urlencode, prints all at once (requires bash 3.1)
# urlencode <string>
#   Percent-encodes every byte of <string> except ASCII letters, digits and
#   the characters . ~ _ -  and prints the result on stdout (no newline).
urlencode() {
  # Switch to the C locale *before* measuring the string. When the length is
  # taken on the same "local" line as LC_ALL=C, ${#1} is expanded first and
  # counts characters, while the loop below indexes bytes.
  local LC_ALL=C
  local c i n out=
  n=${#1}
  for (( i = 0; i < n; i++ )); do
    c=${1:i:1}
    case $c in
      [[:alnum:].~_-])
        out+=$c
        ;;
      *)
        # "'$c" makes printf format the numeric value of the byte.
        printf -v c '%%%02X' "'$c"
        out+=$c
        ;;
    esac
  done
  printf %s "$out"
}
More complete examples (with UTF-8 support)
The command-line utility nkf can decode URLs:
1 echo 'https://ja.wikipedia.org/wiki/%E9%87%8E%E8%89%AF%E7%8C%AB' | nkf --url-input
Note about Ext Ascii and UTF-8 encoding
The following example was never peer-reviewed. Everyone is terrified of it. Proceed at your own risk.
- for values 0x00 - 0x7f Identical
- for values 0x80 - 0xff conflict between UTF-8 & ExtAscii
- for values 0x100 - 0xffff Only UTF-8 UTF-16 UTF-32
- for values 0x100 - 0x7FFFFFFF Only UTF-8 UTF-32
value
EAscii
UTF-8
UTF-16
UTF-32
0x20
"\x20"
"\x20"
\u0020
\U00000020
0x7f
"\x7f"
"\x7f"
\u007f
\U0000007f
0x80
"\x80"
"\xc2\x80"
\u0080
\U00000080
0xff
"\xff"
"\xc3\xbf"
\u00ff
\U000000ff
0x100
N/A
"\xc4\x80"
\u0100
\U00000100
0x1000
N/A
"\xe1\x80\x80"
\u1000
\U00001000
0xffff
N/A
"\xef\xbf\xbf"
\uffff
\U0000ffff
0x10000
N/A
"\xf0\x90\x80\x80"
\ud800\udc00
\U00010000
0xfffff
N/A
"\xf3\xbf\xbf\xbf"
\udbbf\udfff
\U000fffff
0x10000000
N/A
"\xfc\x90\x80\x80\x80\x80"
N/A
\U10000000
0x7fffffff
N/A
"\xfd\xbf\xbf\xbf\xbf\xbf"
N/A
\U7fffffff
0x80000000
N/A
N/A
N/A
N/A
0xffffffff
N/A
N/A
N/A
N/A
1 ###########################################################################
2 ## ord family
3 ###########################################################################
4 # ord <Return Variable Name> <Char to convert> [Optional Format String]
5 # ord_hex <Return Variable Name> <Char to convert>
6 # ord_oct <Return Variable Name> <Char to convert>
7 # ord_utf8 <Return Variable Name> <Char to convert> [Optional Format String]
8 # ord_eascii <Return Variable Name> <Char to convert> [Optional Format String]
9 # ord_echo <Char to convert> [Optional Format String]
10 # ord_hex_echo <Char to convert>
11 # ord_oct_echo <Char to convert>
12 # ord_utf8_echo <Char to convert> [Optional Format String]
13 # ord_eascii_echo <Char to convert> [Optional Format String]
14 #
15 # Description:
16 # converts character using native encoding to its decimal value and stores
17 # it in the Variable specified
18 #
19 # ord
20 # ord_hex output in hex
21 # ord_oct output in octal
22 # ord_utf8 forces UTF8 decoding
23 # ord_eascii forces eascii decoding
24 # ord_echo prints to stdout
# ord <Return Variable Name> <Char to convert> [Optional Format String]
#   Stores the numeric value of <Char>, formatted with the optional printf
#   format (default %d), in the named variable.
ord() {
  # "--" keeps a format that starts with "-" from being parsed as an option.
  printf -v "${1?Missing Dest Variable}" -- "${3:-%d}" "'${2?Missing Char}"
}
# ord_oct <Return Variable Name> <Char to convert>
#   Like ord, but stores the value in octal with a leading "0".
ord_oct() {
  # %o (not %c): the value must be formatted as an octal number.
  ord "${@:1:2}" "0%o"
}
# ord_hex <Return Variable Name> <Char to convert>
#   Like ord, but stores the value in hexadecimal with a leading "0x".
ord_hex() {
  ord "${@:1:2}" "0x%x"
}
# ord_utf8 <Return Variable Name> <Char to convert> [Optional Format String]
#   Runs ord with LC_CTYPE forced to C.UTF-8 for the duration of the call.
ord_utf8() {
  # LC_ALL takes precedence over LC_CTYPE, so it is emptied for the call;
  # otherwise the LC_CTYPE override has no effect when LC_ALL is set.
  LC_ALL= LC_CTYPE=C.UTF-8 ord "${@}"
}
# ord_eascii <Return Variable Name> <Char to convert> [Optional Format String]
#   Runs ord with LC_CTYPE forced to C for the duration of the call.
ord_eascii() {
  # LC_ALL takes precedence over LC_CTYPE, so it is emptied for the call;
  # otherwise the LC_CTYPE override has no effect when LC_ALL is set.
  LC_ALL= LC_CTYPE=C ord "${@}"
}
# ord_echo <Char to convert> [Optional Format String]
#   Prints the numeric value of <Char> on stdout, formatted with the optional
#   printf format (default %d).
ord_echo() {
  # "--" keeps a format that starts with "-" from being parsed as an option.
  printf -- "${2:-%d}" "'${1?Missing Char}"
}
# ord_oct_echo <Char to convert>
#   Prints the value of <Char> in octal with a leading "0".
ord_oct_echo() {
  ord_echo "${1}" "0%o"
}
# ord_hex_echo <Char to convert>
#   Prints the value of <Char> in hexadecimal with a leading "0x".
ord_hex_echo() {
  ord_echo "${1}" "0x%x"
}
# ord_utf8_echo <Char to convert> [Optional Format String]
#   Runs ord_echo with LC_CTYPE forced to C.UTF-8 for the duration of the call.
ord_utf8_echo() {
  # LC_ALL takes precedence over LC_CTYPE, so it is emptied for the call.
  LC_ALL= LC_CTYPE=C.UTF-8 ord_echo "${@}"
}
# ord_eascii_echo <Char to convert> [Optional Format String]
#   Runs ord_echo with LC_CTYPE forced to C for the duration of the call.
ord_eascii_echo() {
  # LC_ALL takes precedence over LC_CTYPE, so it is emptied for the call.
  LC_ALL= LC_CTYPE=C ord_echo "${@}"
}
55
56 ###########################################################################
57 ## chr family
58 ###########################################################################
59 # chr_utf8 <Return Variable Name> <Integer to convert>
60 # chr_eascii <Return Variable Name> <Integer to convert>
61 # chr <Return Variable Name> <Integer to convert>
62 # chr_oct <Return Variable Name> <Octal number to convert>
63 # chr_hex <Return Variable Name> <Hex number to convert>
64 # chr_utf8_echo <Integer to convert>
65 # chr_eascii_echo <Integer to convert>
66 # chr_echo <Integer to convert>
67 # chr_oct_echo <Octal number to convert>
68 # chr_hex_echo <Hex number to convert>
69 #
70 # Description:
71 # converts decimal value to character representation and stores
72 # it in the Variable specified
73 #
74 # chr Tries to guess output format
75 # chr_utf8 forces UTF8 encoding
76 # chr_eascii forces eascii encoding
77 # chr_echo prints to stdout
78 #
# chr_utf8_m <Return Variable Name> <Integer to convert>
#   Encodes the ordinal as UTF-8 by hand (1 to 6 bytes, values 0..0x7fffffff)
#   and stores the resulting bytes in the named variable.
#   bash only supports \u \U since 4.2; this manual encoding works since
#   Bash 3.1 as it only needs printf -v.
#   Returns 1 (and stores an empty string) when the value is out of range.
#   The destination name must not be one of this function's own locals.
chr_utf8_m() {
  local esc piece
  local -i cp bytes lead bits
  cp=$(( ${2:?Missing Ordinal Value} ))

  # Negative values have no UTF-8 form; without this check they would be
  # treated as "<= 0x7f" and printed as a huge octal number.
  if (( cp < 0 || cp > 0x7fffffff )); then
    printf -v "${1:?Missing Dest Variable}" ''
    return 1
  fi

  # Pick the sequence length and the marker bits of the leading byte.
  if (( cp <= 0x7f )); then
    bytes=1 lead=0x00
  elif (( cp <= 0x7ff )); then
    bytes=2 lead=0xc0
  elif (( cp <= 0xffff )); then
    bytes=3 lead=0xe0
  elif (( cp <= 0x1fffff )); then
    bytes=4 lead=0xf0
  elif (( cp <= 0x3ffffff )); then
    bytes=5 lead=0xf8
  else
    bytes=6 lead=0xfc
  fi

  # Leading byte carries the top bits; every continuation byte carries the
  # next 6 bits tagged with 0x80. Each byte is written as a \ooo escape.
  bits=$(( 6 * (bytes - 1) ))
  printf -v esc '\\%o' "$(( (cp >> bits) | lead ))"
  while (( bits > 0 )); do
    bits=$(( bits - 6 ))
    printf -v piece '\\%o' "$(( ((cp >> bits) & 0x3f) | 0x80 ))"
    esc+=${piece}
  done

  # Second printf pass turns the octal escapes into the actual bytes.
  printf -v "${1:?Missing Dest Variable}" "${esc}"
}
# chr_utf8 <Return Variable Name> <Integer to convert>
#   Stores the character for the given ordinal in the named variable, using
#   printf's \U escape (bash 4.2+), except for 0x80-0xff which are UTF-8
#   encoded by hand.
#   Returns 1 for values outside 0..0x7fffffff.
chr_utf8() {
  local val
  local -i cp
  cp=$(( ${2?Missing Ordinal Value} ))
  # Negative values would otherwise be formatted as \Uffff... garbage.
  (( cp >= 0 && cp < 0x80000000 )) || return 1

  if (( cp >= 0x80 && cp < 0x100 )); then
    # bash 4.2 incorrectly encodes
    # \U000000ff as \xff so encode manually
    printf -v val '\\%o\\%o' "$(( (cp >> 6) | 0xc0 ))" "$(( (cp & 0x3f) | 0x80 ))"
  else
    printf -v val '\\U%08x' "${cp}"
  fi
  # Second printf pass expands the escape sequence into the character.
  printf -v "${1?Missing Dest Variable}" "${val}"
}
# chr_eascii <Return Variable Name> <Integer to convert>
#   Stores the single byte with the given value (0..0xff) in the named
#   variable. Returns 1 when the value is outside that range.
chr_eascii() {
  local val
  local -i cp
  cp=$(( ${2?Missing Ordinal Value} ))
  # Only one byte can be produced; reject anything that does not fit
  # (including negative numbers) instead of emitting a bogus escape.
  (( cp >= 0 && cp < 0x100 )) || return 1
  printf -v val '\\%o' "${cp}"
  # Destination is quoted so it is always passed to -v as a single word.
  printf -v "${1?Missing Dest Variable}" "${val}"
}
# chr <Return Variable Name> <Integer to convert>
#   Tries to guess the output format: uses chr_eascii when the character
#   locale is C/POSIX, chr_utf8 otherwise (including when no locale variable
#   is set).
chr() {
  # Locale precedence is LC_ALL, then LC_CTYPE, then LANG.
  case "${LC_ALL:-${LC_CTYPE:-${LANG:-}}}" in
    C | POSIX) chr_eascii "${@}" ;;
    *) chr_utf8 "${@}" ;;
  esac
}
# chr_dec <Return Variable Name> <Decimal number to convert>
#   Calls chr after removing leading zeros from the number.
chr_dec() {
  # strip leading 0s otherwise
  # interpreted as Octal
  local digits="${2#"${2%%[!0]*}"}"
  # An all-zero input ("0", "000") strips to nothing; keep it as 0.
  chr "${1}" "${digits:-0}"
}
# chr_oct <Return Variable Name> <Octal number to convert>
#   Calls chr with a "0" prefix so the number is read as octal.
chr_oct() {
  chr "${1}" "0${2}"
}
# chr_hex <Return Variable Name> <Hex number to convert>
#   Calls chr with a "0x" prefix so the number is read as hexadecimal.
#   An existing 0x / 0X prefix on the argument is accepted.
chr_hex() {
  chr "${1}" "0x${2#0[xX]}"
}
# chr_utf8_echo <Integer to convert>
#   Prints the character for the given ordinal on stdout, using printf's \U
#   escape (bash 4.2+), except for 0x80-0xff which are UTF-8 encoded by hand.
#   Returns 1 (printing nothing) for values outside 0..0x7fffffff.
chr_utf8_echo() {
  local val
  local -i cp
  cp=$(( ${1?Missing Ordinal Value} ))
  # Negative values would otherwise be formatted as \Uffff... garbage.
  (( cp >= 0 && cp < 0x80000000 )) || return 1

  if (( cp >= 0x80 && cp < 0x100 )); then
    # bash 4.2 incorrectly encodes
    # \U000000ff as \xff so encode manually
    printf -v val '\\%o\\%o' "$(( (cp >> 6) | 0xc0 ))" "$(( (cp & 0x3f) | 0x80 ))"
  else
    printf -v val '\\U%08x' "${cp}"
  fi
  # Second printf pass expands the escape sequence into the character.
  printf "${val}"
}
# chr_eascii_echo <Integer to convert>
#   Prints the single byte with the given value (0..0xff) on stdout.
#   Returns 1 (printing nothing) when the value is outside that range.
chr_eascii_echo() {
  local val
  local -i cp
  cp=$(( ${1?Missing Ordinal Value} ))
  # Only one byte can be produced; reject anything that does not fit
  # (including negative numbers) instead of emitting a bogus escape.
  (( cp >= 0 && cp < 0x100 )) || return 1
  printf -v val '\\%o' "${cp}"
  printf "${val}"
}
# chr_echo <Integer to convert>
#   Tries to guess the output format: uses chr_eascii_echo when the character
#   locale is C/POSIX, chr_utf8_echo otherwise (including when no locale
#   variable is set).
chr_echo() {
  # Locale precedence is LC_ALL, then LC_CTYPE, then LANG.
  case "${LC_ALL:-${LC_CTYPE:-${LANG:-}}}" in
    C | POSIX) chr_eascii_echo "${@}" ;;
    *) chr_utf8_echo "${@}" ;;
  esac
}
# chr_dec_echo <Decimal number to convert>
#   Calls chr_echo after removing leading zeros from the number.
chr_dec_echo() {
  # strip leading 0s otherwise
  # interpreted as Octal
  local digits="${1#"${1%%[!0]*}"}"
  # An all-zero input ("0", "000") strips to nothing; keep it as 0.
  chr_echo "${digits:-0}"
}
# chr_oct_echo <Octal number to convert>
#   Calls chr_echo with a "0" prefix so the number is read as octal.
chr_oct_echo() {
  chr_echo "0${1}"
}
# chr_hex_echo <Hex number to convert>
#   Calls chr_echo with a "0x" prefix so the number is read as hexadecimal.
#   An existing 0x / 0X prefix on the argument is accepted.
chr_hex_echo() {
  chr_echo "0x${1#0[xX]}"
}
213
214 #
215 # Simple Validation code
216 #
# test_echo_func <Function> <Argument> <Expected output>
#   Runs "<Function> <Argument>" in a command substitution, compares its
#   stdout with <Expected output> and prints a one-line Pass/Fail report:
#     # <Function> <Argument> => [ <got> = <expected> (<expected>) ] Pass
test_echo_func() {
  local Outcome _result
  _result="$("${1}" "${2}")"
  if [[ "${_result}" == "${3}" ]]; then
    Outcome="Pass"
  else
    Outcome="Fail"
  fi
  # Exactly as many arguments as conversions: printf repeats the format for
  # surplus arguments, which would print the "# ... =>" prefix twice.
  printf '# %-20s %-6s => ' "${1}" "${2}"
  # %q shows control and non-ASCII bytes in a readable quoted form; the last
  # field shows the expected text with control characters replaced by "_".
  printf '[ %16q = %-16q%-5s ] ' "${_result}" "${3}" "(${3//[[:cntrl:]]/_})"
  printf '%s\n' "${Outcome}"
}
# test_value_func <Function> <Argument> <Expected value>
#   Calls "<Function> _result <Argument>" (the function stores its answer in
#   the named variable), compares it with <Expected value> and prints a
#   one-line Pass/Fail report:
#     # <Function> <Argument> => [ <got> = <expected> (<expected>) ] Pass
test_value_func() {
  local Outcome _result
  "${1}" _result "${2}"
  if [[ "${_result}" == "${3}" ]]; then
    Outcome="Pass"
  else
    Outcome="Fail"
  fi
  # Exactly as many arguments as conversions: printf repeats the format for
  # surplus arguments, which would print the "# ... =>" prefix twice.
  printf '# %-20s %-6s => ' "${1}" "${2}"
  # %q shows control and non-ASCII bytes in a readable quoted form; the last
  # field shows the expected text with control characters replaced by "_".
  printf '[ %16q = %-16q%-5s ] ' "${_result}" "${3}" "(${3//[[:cntrl:]]/_})"
  printf '%s\n' "${Outcome}"
}
235 test_echo_func chr_echo "$(ord_echo "A")" "A"
236 test_echo_func ord_echo "$(chr_echo "65")" "65"
237 test_echo_func chr_echo "$(ord_echo "ö")" "ö"
238 test_value_func chr "$(ord_echo "A")" "A"
239 test_value_func ord "$(chr_echo "65")" "65"
240 test_value_func chr "$(ord_echo "ö")" "ö"
241 # chr_echo 65 => [ A = A (A) ] Pass
242 # ord_echo A => [ 65 = 65 (65) ] Pass
243 # chr_echo 246 => [ $'\303\266' = $'\303\266' (ö) ] Pass
244 # chr 65 => [ A = A (A) ] Pass
245 # ord A => [ 65 = 65 (65) ] Pass
246 # chr 246 => [ $'\303\266' = $'\303\266' (ö) ] Pass
247 #
248
249
250 test_echo_func chr_echo "65" A
251 test_echo_func chr_echo "065" 5
252 test_echo_func chr_dec_echo "065" A
253 test_echo_func chr_oct_echo "65" 5
254 test_echo_func chr_hex_echo "65" e
255 test_value_func chr "65" A
256 test_value_func chr "065" 5
257 test_value_func chr_dec "065" A
258 test_value_func chr_oct "65" 5
259 test_value_func chr_hex "65" e
260 # chr_echo 65 => [ A = A (A) ] Pass
261 # chr_echo 065 => [ 5 = 5 (5) ] Pass
262 # chr_dec_echo 065 => [ A = A (A) ] Pass
263 # chr_oct_echo 65 => [ 5 = 5 (5) ] Pass
264 # chr_hex_echo 65 => [ e = e (e) ] Pass
265 # chr 65 => [ A = A (A) ] Pass
266 # chr 065 => [ 5 = 5 (5) ] Pass
267 # chr_dec 065 => [ A = A (A) ] Pass
268 # chr_oct 65 => [ 5 = 5 (5) ] Pass
269 # chr_hex 65 => [ e = e (e) ] Pass
270
271 #test_value_func chr 0xff $'\xff'
272 test_value_func chr_eascii 0xff $'\xff'
273 test_value_func chr_utf8 0xff $'\uff' # Note this fails because bash encodes it incorrectly
274 test_value_func chr_utf8 0xff $'\303\277'
275 test_value_func chr_utf8 0x100 $'\u100'
276 test_value_func chr_utf8 0x1000 $'\u1000'
277 test_value_func chr_utf8 0xffff $'\uffff'
278 # chr_eascii 0xff => [ $'\377' = $'\377' (�) ] Pass
279 # chr_utf8 0xff => [ $'\303\277' = $'\377' (�) ] Fail
280 # chr_utf8 0xff => [ $'\303\277' = $'\303\277' (ÿ) ] Pass
281 # chr_utf8 0x100 => [ $'\304\200' = $'\304\200' (Ā) ] Pass
282 # chr_utf8 0x1000 => [ $'\341\200\200' = $'\341\200\200' (က) ] Pass
283 # chr_utf8 0xffff => [ $'\357\277\277' = $'\357\277\277' (���) ] Pass
284 test_value_func ord_utf8 "A" 65
285 test_value_func ord_utf8 "ä" 228
286 test_value_func ord_utf8 $'\303\277' 255
287 test_value_func ord_utf8 $'\u100' 256
288
289
290
291 #########################################################
292 # to help debug problems try this
293 #########################################################
294 printf "%q\n" $'\xff' # => $'\377'
295 printf "%q\n" $'\uffff' # => $'\357\277\277'
296 printf "%q\n" "$(chr_utf8_echo 0x100)" # => $'\304\200'
297 #
298 # This can help a lot when it comes to diagnosing problems
299 # with read and or xterm program output
300 # I use it a lot in error case to create a human readable error message
301 # i.e.
302 echo "Enter type to test, Enter to continue"
303 while read -srN1 ; do
304 ord asciiValue "${REPLY}"
305 case "${asciiValue}" in
306 10) echo "Goodbye" ; break ;;
307 20|21|22) echo "Yay expected input" ;;
308 *) printf>&2 ':( Unexpected Input 0x%02x %q "%s"\n' "${asciiValue}" "${REPLY}" "${REPLY//[[:cntrl:]]}" ;;
309 esac
310 done
311
312 #########################################################
313 # More exotic approach 1
314 #########################################################
315 # I used to use this before I figured out the LC_CTYPE=C approach
316 # printf "EAsciiLookup=%q" "$(for (( x=0x0; x<0x100 ; x++)); do printf '%b' $(printf '\\x%02x' "$x"); done)"
317 EAsciiLookup=$'\001\002\003\004\005\006\a\b\t\n\v\f\r\016\017\020\021\022\023'
318 EAsciiLookup+=$'\024\025\026\027\030\031\032\E\034\035\036\037 !"#$%&\'()*+,-'
319 EAsciiLookup+=$'./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi'
320 EAsciiLookup+=$'jklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210'
321 EAsciiLookup+=$'\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227'
322 EAsciiLookup+=$'\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246'
323 EAsciiLookup+=$'\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265'
324 EAsciiLookup+=$'\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304'
325 EAsciiLookup+=$'\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323'
326 EAsciiLookup+=$'\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342'
327 EAsciiLookup+=$'\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361'
328 EAsciiLookup+=$'\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
# ord_eascii2 <Return Variable Name> <Char to convert>
#   Finds the first character of <Char> in the EAsciiLookup string (which
#   starts at byte 0x01) and stores its 1-based position, i.e. its byte
#   value, in the named variable.
ord_eascii2() {
  # Everything before the first occurrence of the character. The character
  # is quoted so that "*", "?" and "[" are matched literally, not as globs.
  local idx="${EAsciiLookup%%"${2:0:1}"*}"
  # printf -v assigns to the named variable without eval.
  printf -v "${1}" '%d' "$(( ${#idx} + 1 ))"
}
333
334 #########################################################
335 # More exotic approach 2
336 #########################################################
337 #printf "EAsciiLookup2=(\n %s\n)" "$(for (( x=0x1; x<0x100 ; x++)); do printf '%-18s' "$(printf '[_%q]="0x%02x"' "$(printf "%b" "$(printf '\\x%02x' "$x")")" $x )" ; [ "$(($x%6))" != "0" ] || printf "\n " ; done)"
338 typeset -A EAsciiLookup2
339 EAsciiLookup2=(
340 [_$'\001']="0x01" [_$'\002']="0x02" [_$'\003']="0x03" [_$'\004']="0x04"
341 [_$'\005']="0x05" [_$'\006']="0x06" [_$'\a']="0x07" [_$'\b']="0x08"
342 [_$'\t']="0x09" [_'']="0x0a" [_$'\v']="0x0b" [_$'\f']="0x0c"
343 [_$'\r']="0x0d" [_$'\016']="0x0e" [_$'\017']="0x0f" [_$'\020']="0x10"
344 [_$'\021']="0x11" [_$'\022']="0x12" [_$'\023']="0x13" [_$'\024']="0x14"
345 [_$'\025']="0x15" [_$'\026']="0x16" [_$'\027']="0x17" [_$'\030']="0x18"
346 [_$'\031']="0x19" [_$'\032']="0x1a" [_$'\E']="0x1b" [_$'\034']="0x1c"
347 [_$'\035']="0x1d" [_$'\036']="0x1e" [_$'\037']="0x1f" [_\ ]="0x20"
348 [_\!]="0x21" [_\"]="0x22" [_\#]="0x23" [_\$]="0x24"
349 [_%]="0x25" [_\&]="0x26" [_\']="0x27" [_\(]="0x28"
350 [_\)]="0x29" [_\*]="0x2a" [_+]="0x2b" [_\,]="0x2c"
351 [_-]="0x2d" [_.]="0x2e" [_/]="0x2f" [_0]="0x30"
352 [_1]="0x31" [_2]="0x32" [_3]="0x33" [_4]="0x34"
353 [_5]="0x35" [_6]="0x36" [_7]="0x37" [_8]="0x38"
354 [_9]="0x39" [_:]="0x3a" [_\;]="0x3b" [_\<]="0x3c"
355 [_=]="0x3d" [_\>]="0x3e" [_\?]="0x3f" [_@]="0x40"
356 [_A]="0x41" [_B]="0x42" [_C]="0x43" [_D]="0x44"
357 [_E]="0x45" [_F]="0x46" [_G]="0x47" [_H]="0x48"
358 [_I]="0x49" [_J]="0x4a" [_K]="0x4b" [_L]="0x4c"
359 [_M]="0x4d" [_N]="0x4e" [_O]="0x4f" [_P]="0x50"
360 [_Q]="0x51" [_R]="0x52" [_S]="0x53" [_T]="0x54"
361 [_U]="0x55" [_V]="0x56" [_W]="0x57" [_X]="0x58"
362 [_Y]="0x59" [_Z]="0x5a" [_\[]="0x5b" #[_\\]="0x5c"
363 #[_\]]="0x5d"
364 [_\^]="0x5e" [__]="0x5f" [_\`]="0x60"
365 [_a]="0x61" [_b]="0x62" [_c]="0x63" [_d]="0x64"
366 [_e]="0x65" [_f]="0x66" [_g]="0x67" [_h]="0x68"
367 [_i]="0x69" [_j]="0x6a" [_k]="0x6b" [_l]="0x6c"
368 [_m]="0x6d" [_n]="0x6e" [_o]="0x6f" [_p]="0x70"
369 [_q]="0x71" [_r]="0x72" [_s]="0x73" [_t]="0x74"
370 [_u]="0x75" [_v]="0x76" [_w]="0x77" [_x]="0x78"
371 [_y]="0x79" [_z]="0x7a" [_\{]="0x7b" [_\|]="0x7c"
372 [_\}]="0x7d" [_~]="0x7e" [_$'\177']="0x7f" [_$'\200']="0x80"
373 [_$'\201']="0x81" [_$'\202']="0x82" [_$'\203']="0x83" [_$'\204']="0x84"
374 [_$'\205']="0x85" [_$'\206']="0x86" [_$'\207']="0x87" [_$'\210']="0x88"
375 [_$'\211']="0x89" [_$'\212']="0x8a" [_$'\213']="0x8b" [_$'\214']="0x8c"
376 [_$'\215']="0x8d" [_$'\216']="0x8e" [_$'\217']="0x8f" [_$'\220']="0x90"
377 [_$'\221']="0x91" [_$'\222']="0x92" [_$'\223']="0x93" [_$'\224']="0x94"
378 [_$'\225']="0x95" [_$'\226']="0x96" [_$'\227']="0x97" [_$'\230']="0x98"
379 [_$'\231']="0x99" [_$'\232']="0x9a" [_$'\233']="0x9b" [_$'\234']="0x9c"
380 [_$'\235']="0x9d" [_$'\236']="0x9e" [_$'\237']="0x9f" [_$'\240']="0xa0"
381 [_$'\241']="0xa1" [_$'\242']="0xa2" [_$'\243']="0xa3" [_$'\244']="0xa4"
382 [_$'\245']="0xa5" [_$'\246']="0xa6" [_$'\247']="0xa7" [_$'\250']="0xa8"
383 [_$'\251']="0xa9" [_$'\252']="0xaa" [_$'\253']="0xab" [_$'\254']="0xac"
384 [_$'\255']="0xad" [_$'\256']="0xae" [_$'\257']="0xaf" [_$'\260']="0xb0"
385 [_$'\261']="0xb1" [_$'\262']="0xb2" [_$'\263']="0xb3" [_$'\264']="0xb4"
386 [_$'\265']="0xb5" [_$'\266']="0xb6" [_$'\267']="0xb7" [_$'\270']="0xb8"
387 [_$'\271']="0xb9" [_$'\272']="0xba" [_$'\273']="0xbb" [_$'\274']="0xbc"
388 [_$'\275']="0xbd" [_$'\276']="0xbe" [_$'\277']="0xbf" [_$'\300']="0xc0"
389 [_$'\301']="0xc1" [_$'\302']="0xc2" [_$'\303']="0xc3" [_$'\304']="0xc4"
390 [_$'\305']="0xc5" [_$'\306']="0xc6" [_$'\307']="0xc7" [_$'\310']="0xc8"
391 [_$'\311']="0xc9" [_$'\312']="0xca" [_$'\313']="0xcb" [_$'\314']="0xcc"
392 [_$'\315']="0xcd" [_$'\316']="0xce" [_$'\317']="0xcf" [_$'\320']="0xd0"
393 [_$'\321']="0xd1" [_$'\322']="0xd2" [_$'\323']="0xd3" [_$'\324']="0xd4"
394 [_$'\325']="0xd5" [_$'\326']="0xd6" [_$'\327']="0xd7" [_$'\330']="0xd8"
395 [_$'\331']="0xd9" [_$'\332']="0xda" [_$'\333']="0xdb" [_$'\334']="0xdc"
396 [_$'\335']="0xdd" [_$'\336']="0xde" [_$'\337']="0xdf" [_$'\340']="0xe0"
397 [_$'\341']="0xe1" [_$'\342']="0xe2" [_$'\343']="0xe3" [_$'\344']="0xe4"
398 [_$'\345']="0xe5" [_$'\346']="0xe6" [_$'\347']="0xe7" [_$'\350']="0xe8"
399 [_$'\351']="0xe9" [_$'\352']="0xea" [_$'\353']="0xeb" [_$'\354']="0xec"
400 [_$'\355']="0xed" [_$'\356']="0xee" [_$'\357']="0xef" [_$'\360']="0xf0"
401 [_$'\361']="0xf1" [_$'\362']="0xf2" [_$'\363']="0xf3" [_$'\364']="0xf4"
402 [_$'\365']="0xf5" [_$'\366']="0xf6" [_$'\367']="0xf7" [_$'\370']="0xf8"
403 [_$'\371']="0xf9" [_$'\372']="0xfa" [_$'\373']="0xfb" [_$'\374']="0xfc"
404 [_$'\375']="0xfd" [_$'\376']="0xfe" [_$'\377']="0xff"
405 )
# ord_eascii3 <Return Variable Name> <Char to convert>
#   Looks up the first character of <Char> in the EAsciiLookup2 associative
#   array (keys are "_" followed by the character, values are "0xNN") and
#   stores its decimal value in the named variable. Stores 0 when the
#   character is not found.
ord_eascii3() {
  # -i converts the "0xNN" table value to a number; a missing key gives 0.
  local -i val="${EAsciiLookup2["_${2:0:1}"]-}"
  if (( val == 0 )); then
    # Characters that have no usable key in the table: "]" and "\" are
    # commented out there, and its 0x0a entry is keyed on the empty string.
    case "${2:0:1}" in
      ']') val=0x5d ;;
      '\') val=0x5c ;;
      $'\n') val=0x0a ;;
    esac
  fi
  # printf -v assigns to the named variable without eval.
  printf -v "${1}" '%d' "${val}"
}
416 # for fun check out the following
417 time for (( i=0 ; i <1000; i++ )); do ord TmpVar 'a'; done
418 # real 0m0.065s
419 # user 0m0.048s
420 # sys 0m0.000s
421
422 time for (( i=0 ; i <1000; i++ )); do ord_eascii TmpVar 'a'; done
423 # real 0m0.239s
424 # user 0m0.188s
425 # sys 0m0.000s
426
427 time for (( i=0 ; i <1000; i++ )); do ord_utf8 TmpVar 'a'; done
428 # real 0m0.225s
429 # user 0m0.180s
430 # sys 0m0.000s
431
432 time for (( i=0 ; i <1000; i++ )); do ord_eascii2 TmpVar 'a'; done
433 # real 0m1.507s
434 # user 0m1.056s
435 # sys 0m0.012s
436
437 time for (( i=0 ; i <1000; i++ )); do ord_eascii3 TmpVar 'a'; done
438 # real 0m0.147s
439 # user 0m0.120s
440 # sys 0m0.000s
441
442 time for (( i=0 ; i <1000; i++ )); do ord_echo 'a' >/dev/null ; done
443 # real 0m0.065s
444 # user 0m0.044s
445 # sys 0m0.016s
446
447 time for (( i=0 ; i <1000; i++ )); do ord_eascii_echo 'a' >/dev/null ; done
448 # real 0m0.089s
449 # user 0m0.068s
450 # sys 0m0.008s
451
452 time for (( i=0 ; i <1000; i++ )); do ord_utf8_echo 'a' >/dev/null ; done
453 # real 0m0.226s
454 # user 0m0.172s
455 # sys 0m0.012s