1 /** 2 * Copyright 2014 Internet2 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 /* 17 * Copyright 2001-2004 The Apache Software Foundation. 18 * 19 * Licensed under the Apache License, Version 2.0 (the "License"); 20 * you may not use this file except in compliance with the License. 21 * You may obtain a copy of the License at 22 * 23 * http://www.apache.org/licenses/LICENSE-2.0 24 * 25 * Unless required by applicable law or agreed to in writing, software 26 * distributed under the License is distributed on an "AS IS" BASIS, 27 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 28 * See the License for the specific language governing permissions and 29 * limitations under the License. 30 */ 31 32 package edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.language; 33 34 import edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.EncoderException; 35 import edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.StringEncoder; 36 37 /** 38 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a 39 * general purpose scheme to find word with similar phonemes. 40 * 41 * @author Apache Software Foundation 42 * @version $Id: Soundex.java,v 1.1 2008-11-30 10:57:28 mchyzer Exp $ 43 */ 44 public class Soundex implements StringEncoder { 45 46 /** 47 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. 48 * 49 * @see #US_ENGLISH_MAPPING 50 */ 51 public static final Soundexleware/grouperClientExt/org/apache/commons/codec/language/Soundex.html#Soundex">Soundex US_ENGLISH = new Soundex(); 52 53 /** 54 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 55 * means do not encode. 56 * <p> 57 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick 58 * up the value for the constant values page.) 59 * </p> 60 * 61 * @see #US_ENGLISH_MAPPING 62 */ 63 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; 64 65 /** 66 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 67 * means do not encode. 68 * 69 * @see Soundex#Soundex(char[]) 70 */ 71 public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 72 73 /** 74 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This 75 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or 76 * identical values. 77 * 78 * @param s1 79 * A String that will be encoded and compared. 80 * @param s2 81 * A String that will be encoded and compared. 82 * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 83 * 84 * @see SoundexUtils#difference(StringEncoder,String,String) 85 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS 86 * T-SQL DIFFERENCE </a> 87 * 88 * @throws EncoderException 89 * if an error occurs encoding one of the strings 90 * @since 1.3 91 */ 92 public int difference(String s1, String s2) throws EncoderException { 93 return SoundexUtils.difference(this, s1, s2); 94 } 95 96 /** 97 * The maximum length of a Soundex code - Soundex codes are only four characters by definition. 98 * 99 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 100 */ 101 private int maxLength = 4; 102 103 /** 104 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 105 * letter is mapped. This implementation contains a default map for US_ENGLISH 106 */ 107 private char[] soundexMapping; 108 109 /** 110 * Creates an instance using US_ENGLISH_MAPPING 111 * 112 * @see Soundex#Soundex(char[]) 113 * @see Soundex#US_ENGLISH_MAPPING 114 */ 115 public Soundex() { 116 this(US_ENGLISH_MAPPING); 117 } 118 119 /** 120 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized 121 * mapping for a non-Western character set. 122 * 123 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 124 * letter is mapped. This implementation contains a default map for US_ENGLISH 125 * 126 * @param mapping 127 * Mapping array to use when finding the corresponding code for a given character 128 */ 129 public Soundex(char[] mapping) { 130 this.setSoundexMapping(mapping); 131 } 132 133 /** 134 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of 135 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. 136 * 137 * @param pObject 138 * Object to encode 139 * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String 140 * supplied. 141 * @throws EncoderException 142 * if the parameter supplied is not of type java.lang.String 143 * @throws IllegalArgumentException 144 * if a character is not mapped 145 */ 146 public Object encode(Object pObject) throws EncoderException { 147 if (!(pObject instanceof String)) { 148 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 149 } 150 return soundex((String) pObject); 151 } 152 153 /** 154 * Encodes a String using the soundex algorithm. 155 * 156 * @param pString 157 * A String object to encode 158 * @return A Soundex code corresponding to the String supplied 159 * @throws IllegalArgumentException 160 * if a character is not mapped 161 */ 162 public String encode(String pString) { 163 return soundex(pString); 164 } 165 166 /** 167 * Used internally by the SoundEx algorithm. 168 * 169 * Consonants from the same code group separated by W or H are treated as one. 170 * 171 * @param str 172 * the cleaned working string to encode (in upper case). 173 * @param index 174 * the character position to encode 175 * @return Mapping code for a particular character 176 * @throws IllegalArgumentException 177 * if the character is not mapped 178 */ 179 private char getMappingCode(String str, int index) { 180 char mappedChar = this.map(str.charAt(index)); 181 // HW rule check 182 if (index > 1 && mappedChar != '0') { 183 char hwChar = str.charAt(index - 1); 184 if ('H' == hwChar || 'W' == hwChar) { 185 char preHWChar = str.charAt(index - 2); 186 char firstCode = this.map(preHWChar); 187 if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) { 188 return 0; 189 } 190 } 191 } 192 return mappedChar; 193 } 194 195 /** 196 * Returns the maxLength. Standard Soundex 197 * 198 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 199 * @return int 200 */ 201 public int getMaxLength() { 202 return this.maxLength; 203 } 204 205 /** 206 * Returns the soundex mapping. 207 * 208 * @return soundexMapping. 209 */ 210 private char[] getSoundexMapping() { 211 return this.soundexMapping; 212 } 213 214 /** 215 * Maps the given upper-case character to it's Soudex code. 216 * 217 * @param ch 218 * An upper-case character. 219 * @return A Soundex code. 220 * @throws IllegalArgumentException 221 * Thrown if <code>ch</code> is not mapped. 222 */ 223 private char map(char ch) { 224 int index = ch - 'A'; 225 if (index < 0 || index >= this.getSoundexMapping().length) { 226 throw new IllegalArgumentException("The character is not mapped: " + ch); 227 } 228 return this.getSoundexMapping()[index]; 229 } 230 231 /** 232 * Sets the maxLength. 233 * 234 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 235 * @param maxLength 236 * The maxLength to set 237 */ 238 public void setMaxLength(int maxLength) { 239 this.maxLength = maxLength; 240 } 241 242 /** 243 * Sets the soundexMapping. 244 * 245 * @param soundexMapping 246 * The soundexMapping to set. 247 */ 248 private void setSoundexMapping(char[] soundexMapping) { 249 this.soundexMapping = soundexMapping; 250 } 251 252 /** 253 * Retreives the Soundex code for a given String object. 254 * 255 * @param str 256 * String to encode using the Soundex algorithm 257 * @return A soundex code for the String supplied 258 * @throws IllegalArgumentException 259 * if a character is not mapped 260 */ 261 public String soundex(String str) { 262 if (str == null) { 263 return null; 264 } 265 str = SoundexUtils.clean(str); 266 if (str.length() == 0) { 267 return str; 268 } 269 char out[] = {'0', '0', '0', '0'}; 270 char last, mapped; 271 int incount = 1, count = 1; 272 out[0] = str.charAt(0); 273 last = getMappingCode(str, 0); 274 while ((incount < str.length()) && (count < out.length)) { 275 mapped = getMappingCode(str, incount++); 276 if (mapped != 0) { 277 if ((mapped != '0') && (mapped != last)) { 278 out[count++] = mapped; 279 } 280 last = mapped; 281 } 282 } 283 return new String(out); 284 } 285 286 }