// URLEncoding.java
/*
* $Id: URLEncoding.java,v 1.34 2010/09/29 17:21:48 agoubard Exp $
*
* See the COPYRIGHT file for redistribution and use restrictions.
*/
package org.xins.common.text;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import org.xins.common.MandatoryArgumentChecker;
import org.xins.common.Utils;
/**
 * URL encoding utility functions with Unicode support. This class supports
 * both encoding and decoding. Unreserved ASCII characters are kept as-is, a
 * space becomes <code>'+'</code>, and every other character is converted to
 * its UTF-8 byte sequence with each byte written as <code>%xx</code> (2
 * hexadecimal digits).
 *
 * @version $Revision: 1.34 $ $Date: 2010/09/29 17:21:48 $
 * @author <a href="mailto:ernst@ernstdehaan.com">Ernst de Haan</a>
 * @author <a href="mailto:anthony.goubard@japplis.com">Anthony Goubard</a>
 *
 * @since XINS 1.0.0
 */
public final class URLEncoding {

   /**
    * The character zero (<code>'0'</code>) as an <code>int</code>.
    */
   private static final int CHAR_ZERO = (int) '0';

   /**
    * The character nine (<code>'9'</code>) as an <code>int</code>.
    */
   private static final int CHAR_NINE = (int) '9';

   /**
    * The character lowercase A (<code>'a'</code>) as an <code>int</code>.
    */
   private static final int CHAR_LOWER_A = (int) 'a';

   /**
    * The character lowercase F (<code>'f'</code>) as an <code>int</code>.
    */
   private static final int CHAR_LOWER_F = (int) 'f';

   /**
    * The character uppercase A (<code>'A'</code>) as an <code>int</code>.
    */
   private static final int CHAR_UPPER_A = (int) 'A';

   /**
    * The character uppercase F (<code>'F'</code>) as an <code>int</code>.
    */
   private static final int CHAR_UPPER_F = (int) 'F';

   /**
    * Number of ASCII characters (code points 0 up to and including 127).
    * Only these characters are encoded via the lookup table.
    */
   private static final int ASCII_COUNT = 128;

   /**
    * Mappings from unencoded (array index) to encoded values (array
    * elements). The size of this array is {@link #ASCII_COUNT}.
    */
   private static final String[] UNENCODED_TO_ENCODED;

   static {
      UNENCODED_TO_ENCODED = new String[ASCII_COUNT];
      for (int i = 0; i < ASCII_COUNT; i++) {
         char c = (char) i;
         if ((c >= 'a' && c <= 'z') ||
             (c >= 'A' && c <= 'Z') ||
             (c >= '0' && c <= '9') ||
             (c == '-')             ||
             (c == '_')             ||
             (c == '.')             ||
             (c == '*')) {

            // Safe characters are passed through unchanged
            UNENCODED_TO_ENCODED[i] = String.valueOf(c);
         } else if (c == ' ') {

            // A space is represented by a plus sign
            UNENCODED_TO_ENCODED[i] = "+";
         } else {

            // Everything else becomes %XX, with uppercase hex digits
            char[] data = new char[3];
            data[0] = '%';
            data[1] = Character.toUpperCase(Character.forDigit((i >> 4) & 0xF, 16));
            data[2] = Character.toUpperCase(Character.forDigit( i       & 0xF, 16));
            UNENCODED_TO_ENCODED[i] = new String(data);
         }
      }
   }

   /**
    * Constructs a new <code>URLEncoding</code> object. This constructor is
    * private since this class only offers static utility methods.
    */
   private URLEncoding() {
      // empty
   }

   /**
    * URL encodes the specified character string as specified by W3C.
    * http://www.w3.org/International/O-URL-code.html
    *
    * <p>ASCII characters are encoded using a lookup table. All other
    * characters are converted to UTF-8 and each resulting byte is written
    * as <code>%xx</code>, using lowercase hexadecimal digits. A valid
    * surrogate pair is encoded as a single 4-byte UTF-8 sequence, so that
    * {@link #decode(String)} can restore the original character. An
    * unpaired surrogate is encoded as a 3-byte sequence.
    *
    * @param s
    *    the string to URL encode, not <code>null</code>.
    *
    * @return
    *    URL encoded version of the specified character string, never
    *    <code>null</code>.
    *
    * @throws IllegalArgumentException
    *    if <code>s == null</code>
    */
   public static String encode(String s)
   throws IllegalArgumentException {

      // Check preconditions
      MandatoryArgumentChecker.check("s", s);

      // Short-circuit if the string is empty
      int length = s.length();
      if (length < 1) {
         return "";
      }

      // Construct a buffer
      StringBuilder buffer = new StringBuilder(length * 2);

      // Loop through the string and append either the table entry (ASCII)
      // or the percent-escaped UTF-8 bytes of the character
      char[] content = s.toCharArray();
      for (int i = 0; i < length; i++) {
         char ch = content[i];
         int c = (int) ch;

         if (c < ASCII_COUNT) {
            buffer.append(UNENCODED_TO_ENCODED[c]);

         // 2-byte UTF-8 sequence: 0x80 <= c <= 0x7FF
         } else if (c <= 0x07FF) {
            appendEscapedByte(buffer, 0xC0 | (c >> 6));
            appendEscapedByte(buffer, 0x80 | (c & 0x3F));

         // Supplementary character: combine the surrogate pair into a
         // single code point and emit a 4-byte UTF-8 sequence. Encoding
         // both halves separately would produce bytes that are not valid
         // UTF-8 and hence cannot be decoded again.
         } else if (Character.isHighSurrogate(ch)
                 && i + 1 < length
                 && Character.isLowSurrogate(content[i + 1])) {
            int codePoint = Character.toCodePoint(ch, content[i + 1]);
            appendEscapedByte(buffer, 0xF0 |  (codePoint >> 18));
            appendEscapedByte(buffer, 0x80 | ((codePoint >> 12) & 0x3F));
            appendEscapedByte(buffer, 0x80 | ((codePoint >>  6) & 0x3F));
            appendEscapedByte(buffer, 0x80 |  (codePoint        & 0x3F));

            // The low surrogate has been consumed as well
            i++;

         // 3-byte UTF-8 sequence: 0x7FF < c <= 0xFFFF
         } else {
            appendEscapedByte(buffer, 0xE0 |  (c >> 12));
            appendEscapedByte(buffer, 0x80 | ((c >>  6) & 0x3F));
            appendEscapedByte(buffer, 0x80 |  (c        & 0x3F));
         }
      }

      return buffer.toString();
   }

   /**
    * Appends one percent-escaped byte to the specified buffer.
    *
    * @param buffer
    *    the buffer to append to, not <code>null</code>.
    *
    * @param value
    *    the byte value to append; callers only pass values in the range
    *    0x80 - 0xFF, so the hexadecimal form always has 2 digits.
    */
   private static void appendEscapedByte(StringBuilder buffer, int value) {
      buffer.append('%');
      buffer.append(Integer.toHexString(value));
   }

   /**
    * Decodes the specified URL encoded character string.
    * http://www.w3.org/International/O-URL-code.html
    *
    * <p>A plus sign is decoded to a space. A run of consecutive
    * <code>%xx</code> escapes is collected into a byte sequence which is
    * then interpreted as UTF-8. All other characters are copied unchanged.
    *
    * @param s
    *    the URL encoded string to decode, not <code>null</code>.
    *
    * @return
    *    unencoded version of the specified URL encoded character string,
    *    never <code>null</code>.
    *
    * @throws IllegalArgumentException
    *    if <code>s == null</code>.
    *
    * @throws FormatException
    *    if any of the following conditions is true:
    *    <ul>
    *    <li><code>s.{@link String#charAt(int) charAt}(s.{@link String#length() length}() - 1) == '%'</code>
    *        (last character is a percentage sign)
    *    <li><code>s.{@link String#charAt(int) charAt}(s.{@link String#length() length}() - 2) == '%'</code>
    *        (before-last character is a percentage sign)
    *    <li>a percentage sign is followed by 2 characters of which at
    *        least one is not a hexadecimal digit
    *    </ul>
    */
   public static String decode(String s)
   throws IllegalArgumentException, FormatException {

      // Check preconditions
      MandatoryArgumentChecker.check("s", s);

      // If the string is empty, return the original string
      int length = s.length();
      if (length == 0) {
         return s;
      }

      // Avoid calls to charAt() method.
      char[] string = s.toCharArray();

      // The decoded string is never longer than the encoded one
      StringBuilder buffer = new StringBuilder(length);

      // Loop through the string
      int index = 0;
      while (index < length) {
         char c = string[index];

         // Special case: Recognize plus sign as a space
         if (c == '+') {
            buffer.append(' ');
            index++;

         // Catch encoded characters; consecutive escapes are gathered
         // first, since one character may span multiple UTF-8 bytes
         } else if (c == '%') {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            while (index < length && string[index] == '%') {

               // The percentage sign must be followed by 2 characters
               if (index >= length - 2) {
                  throw new FormatException(s, "Character at position " + index + " has invalid value " + (int) string[index] + '.');
               }

               int high = digit(string[index + 1], s, index + 1);
               int low  = digit(string[index + 2], s, index + 2);
               baos.write((high << 4) | low);

               // Skip the percentage sign and both hex digits
               index += 3;
            }

            try {
               buffer.append(baos.toString("UTF-8"));
            } catch (UnsupportedEncodingException uee) {

               // Should not happen, UTF-8 is supported on all Java platforms
               Utils.logProgrammingError(uee);
            }

         // Append the character
         } else {
            buffer.append(c);
            index++;
         }
      }

      return buffer.toString();
   }

   /**
    * Convert a hexadecimal digit to a number.
    *
    * @param charAsInt
    *    the hexadecimal digit.
    *
    * @param s
    *    the String from which the character has been taken.
    *
    * @param index
    *    the position of the character within the String.
    *
    * @return
    *    the value of the hexadecimal digit, between 0 and 15 (inclusive).
    *
    * @throws FormatException
    *    if the character is not a numerical digit or a letter between
    *    'a' and 'f' or between 'A' and 'F'.
    */
   private static int digit(int charAsInt, String s, int index) throws FormatException {
      int decodedValue;
      if (charAsInt >= CHAR_ZERO && charAsInt <= CHAR_NINE) {
         decodedValue = charAsInt - CHAR_ZERO;
      } else if (charAsInt >= CHAR_LOWER_A && charAsInt <= CHAR_LOWER_F) {
         decodedValue = charAsInt - CHAR_LOWER_A + 10;
      } else if (charAsInt >= CHAR_UPPER_A && charAsInt <= CHAR_UPPER_F) {
         decodedValue = charAsInt - CHAR_UPPER_A + 10;
      } else {
         throw new FormatException(s, "Character at position " + index + " is not a hex digit. Value is " + charAsInt + '.');
      }

      return decodedValue;
   }
}