View Javadoc
1   /**
2    * Copyright 2014 Internet2
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  /*
17   * Copyright 2001-2004 The Apache Software Foundation.
18   * 
19   * Licensed under the Apache License, Version 2.0 (the "License");
20   * you may not use this file except in compliance with the License.
21   * You may obtain a copy of the License at
22   * 
23   *      http://www.apache.org/licenses/LICENSE-2.0
24   * 
25   * Unless required by applicable law or agreed to in writing, software
26   * distributed under the License is distributed on an "AS IS" BASIS,
27   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28   * See the License for the specific language governing permissions and
29   * limitations under the License.
30   */ 
31  
32  package edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.language;
33  
34  import edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.EncoderException;
35  import edu.internet2.middleware.grouperClientExt.org.apache.commons.codec.StringEncoder;
36  
37  /**
38   * Encodes a string into a Refined Soundex value. A refined soundex code is
39   * optimized for spell checking words. Soundex method originally developed by
40   * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
41   * 
42   * @author Apache Software Foundation
43   * @version $Id: RefinedSoundex.java,v 1.1 2008-11-30 10:57:28 mchyzer Exp $
44   */
45  public class RefinedSoundex implements StringEncoder {
46  
47      /**
48  	 * This static variable contains an instance of the RefinedSoundex using
49  	 * the US_ENGLISH mapping.
50  	 */
51      public static final RefinedSoundexgrouperClientExt/org/apache/commons/codec/language/RefinedSoundex.html#RefinedSoundex">RefinedSoundex US_ENGLISH = new RefinedSoundex();
52  
53      /**
54  	 * RefinedSoundex is *refined* for a number of reasons one being that the
55  	 * mappings have been altered. This implementation contains default
56  	 * mappings for US English.
57  	 */
58      public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray();
59  
60      /**
61  	 * Every letter of the alphabet is "mapped" to a numerical value. This char
62  	 * array holds the values to which each letter is mapped. This
63  	 * implementation contains a default map for US_ENGLISH
64  	 */
65      private char[] soundexMapping;
66  
67      /**
68  	 * Creates an instance of the RefinedSoundex object using the default US
69  	 * English mapping.
70  	 */
71      public RefinedSoundex() {
72          this(US_ENGLISH_MAPPING);
73      }
74  
75      /**
76  	 * Creates a refined soundex instance using a custom mapping. This
77  	 * constructor can be used to customize the mapping, and/or possibly
78  	 * provide an internationalized mapping for a non-Western character set.
79  	 * 
80  	 * @param mapping
81  	 *                  Mapping array to use when finding the corresponding code for
82  	 *                  a given character
83  	 */
84      public RefinedSoundex(char[] mapping) {
85          this.soundexMapping = mapping;
86      }
87  
88      /**
89  	 * Returns the number of characters in the two encoded Strings that are the
90  	 * same. This return value ranges from 0 to the length of the shortest
91  	 * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
92  	 * example) indicates strong similarity or identical values. For refined
93  	 * Soundex, the return value can be greater than 4.
94  	 * 
95  	 * @param s1
96  	 *                  A String that will be encoded and compared.
97  	 * @param s2
98  	 *                  A String that will be encoded and compared.
99  	 * @return The number of characters in the two encoded Strings that are the
100 	 *             same from 0 to to the length of the shortest encoded String.
101 	 * 
102 	 * @see SoundexUtils#difference(StringEncoder,String,String)
103 	 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
104 	 *          MS T-SQL DIFFERENCE</a>
105 	 * 
106 	 * @throws EncoderException
107 	 *                  if an error occurs encoding one of the strings
108      * @since 1.3
109 	 */
110     public int difference(String s1, String s2) throws EncoderException {
111         return SoundexUtils.difference(this, s1, s2);
112     }
113 
114     /**
115 	 * Encodes an Object using the refined soundex algorithm. This method is
116 	 * provided in order to satisfy the requirements of the Encoder interface,
117 	 * and will throw an EncoderException if the supplied object is not of type
118 	 * java.lang.String.
119 	 * 
120 	 * @param pObject
121 	 *                  Object to encode
122 	 * @return An object (or type java.lang.String) containing the refined
123 	 *             soundex code which corresponds to the String supplied.
124 	 * @throws EncoderException
125 	 *                  if the parameter supplied is not of type java.lang.String
126 	 */
127     public Object encode(Object pObject) throws EncoderException {
128         if (!(pObject instanceof java.lang.String)) {
129             throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
130         }
131         return soundex((String) pObject);
132     }
133 
134     /**
135 	 * Encodes a String using the refined soundex algorithm.
136 	 * 
137 	 * @param pString
138 	 *                  A String object to encode
139 	 * @return A Soundex code corresponding to the String supplied
140 	 */
141     public String encode(String pString) {
142         return soundex(pString);
143     }
144 
145     /**
146 	 * Returns the mapping code for a given character. The mapping codes are
147 	 * maintained in an internal char array named soundexMapping, and the
148 	 * default values of these mappings are US English.
149 	 * 
150 	 * @param c
151 	 *                  char to get mapping for
152 	 * @return A character (really a numeral) to return for the given char
153 	 */
154     char getMappingCode(char c) {
155         if (!Character.isLetter(c)) {
156             return 0;
157         }
158         return this.soundexMapping[Character.toUpperCase(c) - 'A'];
159     }
160 
161     /**
162 	 * Retreives the Refined Soundex code for a given String object.
163 	 * 
164 	 * @param str
165 	 *                  String to encode using the Refined Soundex algorithm
166 	 * @return A soundex code for the String supplied
167 	 */
168     public String soundex(String str) {
169         if (str == null) {
170             return null;
171         }
172         str = SoundexUtils.clean(str);
173         if (str.length() == 0) {
174             return str;
175         }
176 
177         StringBuffer sBuf = new StringBuffer();
178         sBuf.append(str.charAt(0));
179 
180         char last, current;
181         last = '*';
182 
183         for (int i = 0; i < str.length(); i++) {
184 
185             current = getMappingCode(str.charAt(i));
186             if (current == last) {
187                 continue;
188             } else if (current != 0) {
189                 sBuf.append(current);
190             }
191 
192             last = current;
193 
194         }
195 
196         return sBuf.toString();
197     }
198 }