From 14b4ba83c35c34f4a1f3a69c9967f502ee2d6528 Mon Sep 17 00:00:00 2001 From: Andrew Janke Date: Sun, 9 Aug 2015 16:28:47 -0400 Subject: [PATCH] Move urlencode/urldecode functions to core lib --- lib/functions.zsh | 134 ++++++++++++++++++++++++++++++++++++++++++++ lib/termsupport.zsh | 33 +---------- 2 files changed, 135 insertions(+), 32 deletions(-) diff --git a/lib/functions.zsh b/lib/functions.zsh index 17f5f9cb..5c1a5a28 100644 --- a/lib/functions.zsh +++ b/lib/functions.zsh @@ -73,3 +73,137 @@ function env_default() { env | grep -q "^$1=" && return 0 export "$1=$2" && return 3 } + + +# Required for $langinfo +zmodload zsh/langinfo + +# URL-encode a string +# +# Encodes a string using RFC 2396 URL-encoding (%-escaped). +# See: https://www.ietf.org/rfc/rfc2396.txt +# +# By default, reserved characters and unreserved "mark" characters are +# not escaped by this function. This allows the common usage of passing +# an entire URL in, and encoding just special characters in it, with +# the expectation that reserved and mark characters are used appropriately. +# The -r and -m options turn on escaping of the reserved and mark characters, +# respectively, which allows arbitrary strings to be fully escaped for +# embedding inside URLs, where reserved characters might be misinterpreted. +# +# Prints the encoded string on stdout. +# Returns nonzero if encoding failed. 
+#
+# Usage:
+#  omz_urlencode [-r] [-m] [-P] string
+#
+#    -r causes reserved characters (;/?:@&=+$,) to be escaped
+#
+#    -m causes "mark" characters (_.!~*'()-) to be escaped
+#
+#    -P causes spaces to be encoded as '%20' instead of '+'
+function omz_urlencode() {
+  emulate -L zsh
+  # Keep the parsed-options array local so the caller's $opts is not clobbered
+  local -a opts
+  zparseopts -D -E -a opts r m P
+
+  local str=$1
+  local url_str=""
+  local spaces_as_plus
+  if [[ -z $opts[(r)-P] ]]; then spaces_as_plus=1; fi
+
+  # URLs must use UTF-8 encoding; convert str to UTF-8 if required.
+  # An empty codeset (e.g. zsh/langinfo not loaded) is treated as "no
+  # conversion needed" instead of producing a malformed iconv command line.
+  local encoding=$langinfo[CODESET]
+  local -a safe_encodings
+  safe_encodings=(UTF-8 utf8 US-ASCII)
+  if [[ -n $encoding && -z ${safe_encodings[(r)$encoding]} ]]; then
+    # print -r -- (not echo -E) so a string starting with "-" is never
+    # mistaken for an option
+    str=$(print -r -- "$str" | iconv -f $encoding -t UTF-8)
+    if [[ $? != 0 ]]; then
+      echo "Error converting string from $encoding to UTF-8" >&2
+      return 1
+    fi
+  fi
+
+  # Use LC_ALL=C to process text byte-by-byte
+  local i byte ord LC_ALL=C
+  export LC_ALL
+  local reserved=';/?:@&=+$,'
+  # '\'' puts a real apostrophe into the set; a doubled '' inside single
+  # quotes is just an empty concatenation when RC_QUOTES is off
+  local mark='_.!~*'\''()-'
+  local dont_escape="[A-Za-z0-9"
+  if [[ -z $opts[(r)-r] ]]; then
+    dont_escape+=$reserved
+  fi
+  # $mark must be last because of the "-"
+  if [[ -z $opts[(r)-m] ]]; then
+    dont_escape+=$mark
+  fi
+  dont_escape+="]"
+
+  # The result is built in-process, with no subshells inside the loop,
+  # for performance (primarily on Windows).
+  for (( i = 1; i <= ${#str}; ++i )); do
+    byte="$str[i]"
+    if [[ "$byte" =~ "$dont_escape" ]]; then
+      url_str+="$byte"
+    elif [[ "$byte" == " " && -n $spaces_as_plus ]]; then
+      url_str+="+"
+    else
+      ord=$(( [##16] #byte ))
+      # Left-pad to two hex digits so bytes below 0x10 give e.g. "%09", not "%9"
+      url_str+="%${(l:2::0:)ord}"
+    fi
+  done
+  print -r -- "$url_str"
+}
+
+# URL-decode a string
+#
+# Decodes a RFC 2396 URL-encoded (%-escaped) string.
+# This decodes the '+' and '%' escapes in the input string, and leaves
+# other characters unchanged. Does not enforce that the input is a
+# valid URL-encoded string. This is a convenience to allow callers to
+# pass in a full URL or similar strings and decode them for human
+# presentation.
+# +# Outputs the encoded string on stdout. +# Returns nonzero if encoding failed. +# +# Usage: +# omz_urldecode - prints decoded string followed by a newline +function omz_urldecode { + emulate -L zsh + local encoded_url=$1 + + echo -e input $1 + # Work bytewise, since URLs escape UTF-8 octets + local caller_encoding=$langinfo[CODESET] + local LC_ALL=C + export LC_ALL + + # Change + back to ' ' + local tmp=${encoded_url:gs/+/ /} + # Protect other escapes to pass through the printf unchanged + tmp=${tmp:gs/\\/\\\\/} + # Handle %-escapes by turning them into `\xXX` printf escapes + tmp=${tmp:gs/%/\\x/} + echo -E "before decode $tmp" + local decoded + eval "decoded=\$'$tmp'" + + # Now we have a UTF-8 encoded string in the variable. We need to re-encode + # it if caller is in a non-UTF-8 locale. + local safe_encodings + safe_encodings=(UTF-8 utf8 US-ASCII) + if [[ -z ${safe_encodings[(r)$caller_encoding]} ]]; then + decoded=$(echo -E "$decoded" | iconv -f UTF-8 -t $caller_encoding) + if [[ $? != 0 ]]; then + echo "Error converting string from UTF-8 to $caller_encoding" >&2 + return 1 + fi + fi + + echo -E "$decoded" +} + diff --git a/lib/termsupport.zsh b/lib/termsupport.zsh index 52622f5a..726cdce4 100644 --- a/lib/termsupport.zsh +++ b/lib/termsupport.zsh @@ -59,44 +59,13 @@ preexec_functions+=(omz_termsupport_preexec) if [[ "$TERM_PROGRAM" == "Apple_Terminal" ]] && [[ -z "$INSIDE_EMACS" ]]; then - # URL-encodes a string - # Outputs the encoded string on stdout - # Returns nonzero if encoding failed - function _omz_urlencode() { - local str=$1 - local url_str="" - - # URLs must use UTF-8 encoding; convert if required - local encoding=${LC_CTYPE/*./} - if [[ -n $encoding && $encoding != UTF-8 && $encoding != utf8 ]]; then - str=$(echo $str | iconv -f $encoding -t UTF-8) - if [[ $? 
!= 0 ]]; then - echo "Error converting string from $encoding to UTF-8" >&2 - return 1 - fi - fi - - # Use LC_CTYPE=C to process text byte-by-byte - local i ch hexch LC_CTYPE=C - for ((i = 1; i <= ${#str}; ++i)); do - ch="$str[i]" - if [[ "$ch" =~ [/._~A-Za-z0-9-] ]]; then - url_str+="$ch" - else - hexch=$(printf "%02X" "'$ch") - url_str+="%$hexch" - fi - done - echo $url_str - } - # Emits the control sequence to notify Terminal.app of the cwd function update_terminalapp_cwd() { # Identify the directory using a "file:" scheme URL, including # the host name to disambiguate local vs. remote paths. # Percent-encode the pathname. - local URL_PATH=$(_omz_urlencode $PWD) + local URL_PATH=$(omz_urlencode -P $PWD) [[ $? != 0 ]] && return 1 local PWD_URL="file://$HOST$URL_PATH" # Undocumented Terminal.app-specific control sequence