View Javadoc
1   /**
2    * Copyright 2014 Internet2
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  /*
17   * Copyright 2001-2004 The Apache Software Foundation.
18   * 
19   * Licensed under the Apache License, Version 2.0 (the "License");
20   * you may not use this file except in compliance with the License.
21   * You may obtain a copy of the License at
22   * 
23   *      http://www.apache.org/licenses/LICENSE-2.0
24   * 
25   * Unless required by applicable law or agreed to in writing, software
26   * distributed under the License is distributed on an "AS IS" BASIS,
27   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28   * See the License for the specific language governing permissions and
29   * limitations under the License.
30   */ 
31  
32  package edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.language;
33  
34  import edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.EncoderException;
35  import edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.StringEncoder;
36  
37  /**
38   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
39   * general purpose scheme to find word with similar phonemes.
40   * 
41   * @author Apache Software Foundation
42   * @version $Id: Soundex.java,v 1.1 2008-11-30 10:57:28 mchyzer Exp $
43   */
44  public class Soundex implements StringEncoder {
45  
46      /**
47       * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
48       * 
49       * @see #US_ENGLISH_MAPPING
50       */
51      public static final Soundexleware/grouperClientExt/org/apache/commons/codec/language/Soundex.html#Soundex">Soundex US_ENGLISH = new Soundex();
52  
53      /**
54       * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
55       * means do not encode.
56       * <p>
57       * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
58       * up the value for the constant values page.)
59       * </p>
60       * 
61       * @see #US_ENGLISH_MAPPING
62       */
63      public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
64  
65      /**
66       * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
67       * means do not encode.
68       * 
69       * @see Soundex#Soundex(char[])
70       */
71      public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
72  
73      /**
74       * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
75       * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
76       * identical values.
77       * 
78       * @param s1
79       *                  A String that will be encoded and compared.
80       * @param s2
81       *                  A String that will be encoded and compared.
82       * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
83       * 
84       * @see SoundexUtils#difference(StringEncoder,String,String)
85       * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
86       *          T-SQL DIFFERENCE </a>
87       * 
88       * @throws EncoderException
89       *                  if an error occurs encoding one of the strings
90       * @since 1.3
91       */
92      public int difference(String s1, String s2) throws EncoderException {
93          return SoundexUtils.difference(this, s1, s2);
94      }
95  
96      /**
97       * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
98       * 
99       * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
100      */
101     private int maxLength = 4;
102 
103     /**
104      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
105      * letter is mapped. This implementation contains a default map for US_ENGLISH
106      */
107     private char[] soundexMapping;
108 
109     /**
110      * Creates an instance using US_ENGLISH_MAPPING
111      * 
112      * @see Soundex#Soundex(char[])
113      * @see Soundex#US_ENGLISH_MAPPING
114      */
115     public Soundex() {
116         this(US_ENGLISH_MAPPING);
117     }
118 
119     /**
120      * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
121      * mapping for a non-Western character set.
122      * 
123      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
124      * letter is mapped. This implementation contains a default map for US_ENGLISH
125      * 
126      * @param mapping
127      *                  Mapping array to use when finding the corresponding code for a given character
128      */
129     public Soundex(char[] mapping) {
130         this.setSoundexMapping(mapping);
131     }
132 
133     /**
134      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
135      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
136      * 
137      * @param pObject
138      *                  Object to encode
139      * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
140      *             supplied.
141      * @throws EncoderException
142      *                  if the parameter supplied is not of type java.lang.String
143      * @throws IllegalArgumentException
144      *                  if a character is not mapped
145      */
146     public Object encode(Object pObject) throws EncoderException {
147         if (!(pObject instanceof String)) {
148             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
149         }
150         return soundex((String) pObject);
151     }
152 
153     /**
154      * Encodes a String using the soundex algorithm.
155      * 
156      * @param pString
157      *                  A String object to encode
158      * @return A Soundex code corresponding to the String supplied
159      * @throws IllegalArgumentException
160      *                  if a character is not mapped
161      */
162     public String encode(String pString) {
163         return soundex(pString);
164     }
165 
166     /**
167      * Used internally by the SoundEx algorithm.
168      * 
169      * Consonants from the same code group separated by W or H are treated as one.
170      * 
171      * @param str
172      *                  the cleaned working string to encode (in upper case).
173      * @param index
174      *                  the character position to encode
175      * @return Mapping code for a particular character
176      * @throws IllegalArgumentException
177      *                  if the character is not mapped
178      */
179     private char getMappingCode(String str, int index) {
180         char mappedChar = this.map(str.charAt(index));
181         // HW rule check
182         if (index > 1 && mappedChar != '0') {
183             char hwChar = str.charAt(index - 1);
184             if ('H' == hwChar || 'W' == hwChar) {
185                 char preHWChar = str.charAt(index - 2);
186                 char firstCode = this.map(preHWChar);
187                 if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
188                     return 0;
189                 }
190             }
191         }
192         return mappedChar;
193     }
194 
195     /**
196      * Returns the maxLength. Standard Soundex
197      * 
198      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
199      * @return int
200      */
201     public int getMaxLength() {
202         return this.maxLength;
203     }
204 
205     /**
206      * Returns the soundex mapping.
207      * 
208      * @return soundexMapping.
209      */
210     private char[] getSoundexMapping() {
211         return this.soundexMapping;
212     }
213 
214     /**
215      * Maps the given upper-case character to it's Soudex code.
216      * 
217      * @param ch
218      *                  An upper-case character.
219      * @return A Soundex code.
220      * @throws IllegalArgumentException
221      *                  Thrown if <code>ch</code> is not mapped.
222      */
223     private char map(char ch) {
224         int index = ch - 'A';
225         if (index < 0 || index >= this.getSoundexMapping().length) {
226             throw new IllegalArgumentException("The character is not mapped: " + ch);
227         }
228         return this.getSoundexMapping()[index];
229     }
230 
231     /**
232      * Sets the maxLength.
233      * 
234      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
235      * @param maxLength
236      *                  The maxLength to set
237      */
238     public void setMaxLength(int maxLength) {
239         this.maxLength = maxLength;
240     }
241 
242     /**
243      * Sets the soundexMapping.
244      * 
245      * @param soundexMapping
246      *                  The soundexMapping to set.
247      */
248     private void setSoundexMapping(char[] soundexMapping) {
249         this.soundexMapping = soundexMapping;
250     }
251 
252     /**
253      * Retreives the Soundex code for a given String object.
254      * 
255      * @param str
256      *                  String to encode using the Soundex algorithm
257      * @return A soundex code for the String supplied
258      * @throws IllegalArgumentException
259      *                  if a character is not mapped
260      */
261     public String soundex(String str) {
262         if (str == null) {
263             return null;
264         }
265         str = SoundexUtils.clean(str);
266         if (str.length() == 0) {
267             return str;
268         }
269         char out[] = {'0', '0', '0', '0'};
270         char last, mapped;
271         int incount = 1, count = 1;
272         out[0] = str.charAt(0);
273         last = getMappingCode(str, 0);
274         while ((incount < str.length()) && (count < out.length)) {
275             mapped = getMappingCode(str, incount++);
276             if (mapped != 0) {
277                 if ((mapped != '0') && (mapped != last)) {
278                     out[count++] = mapped;
279                 }
280                 last = mapped;
281             }
282         }
283         return new String(out);
284     }
285 
286 }