View Javadoc
1   /*******************************************************************************
2    * Copyright 2012 Internet2
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   ******************************************************************************/
16  /*
17   * Copyright 2001-2004 The Apache Software Foundation.
18   * 
19   * Licensed under the Apache License, Version 2.0 (the "License");
20   * you may not use this file except in compliance with the License.
21   * You may obtain a copy of the License at
22   * 
23   *      http://www.apache.org/licenses/LICENSE-2.0
24   * 
25   * Unless required by applicable law or agreed to in writing, software
26   * distributed under the License is distributed on an "AS IS" BASIS,
27   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28   * See the License for the specific language governing permissions and
29   * limitations under the License.
30   */ 
31  
32  package edu.internet2.middleware.grouperInstallerExt.org.apache.commons.codec.language;
33  
34  import edu.internet2.middleware.grouperInstallerExt.org.apache.commons.codec.EncoderException;
35  import edu.internet2.middleware.grouperInstallerExt.org.apache.commons.codec.StringEncoder;
36  
37  /**
38   * Encodes a string into a metaphone value. 
39   * <p>
40   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 
41   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
42   * </p>
43   * <p>
44   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
45   * 39.</CITE>
46   * </p>
47   * 
48   * @author Apache Software Foundation
49   * @version $Id: Metaphone.java,v 1.1 2008-11-30 10:57:28 mchyzer Exp $
50   */
51  public class Metaphone implements StringEncoder {
52  
53      /**
54       * Five values in the English language 
55       */
56      private String vowels = "AEIOU" ;
57  
58      /**
59       * Variable used in Metaphone algorithm
60       */
61      private String frontv = "EIY"   ;
62  
63      /**
64       * Variable used in Metaphone algorithm
65       */
66      private String varson = "CSPTG" ;
67  
68      /**
69       * The max code length for metaphone is 4
70       */
71      private int maxCodeLen = 4 ;
72  
73      /**
74       * Creates an instance of the Metaphone encoder
75       */
76      public Metaphone() {
77          super();
78      }
79  
80      /**
81       * Find the metaphone value of a String. This is similar to the
82       * soundex algorithm, but better at finding similar sounding words.
83       * All input is converted to upper case.
84       * Limitations: Input format is expected to be a single ASCII word
85       * with only characters in the A - Z range, no punctuation or numbers.
86       *
87       * @param txt String to find the metaphone code for
88       * @return A metaphone code corresponding to the String supplied
89       */
90      public String metaphone(String txt) {
91          boolean hard = false ;
92          if ((txt == null) || (txt.length() == 0)) {
93              return "" ;
94          }
95          // single character is itself
96          if (txt.length() == 1) {
97              return txt.toUpperCase() ;
98          }
99        
100         char[] inwd = txt.toUpperCase().toCharArray() ;
101       
102         StringBuffer local = new StringBuffer(40); // manipulate
103         StringBuffer code = new StringBuffer(10) ; //   output
104         // handle initial 2 characters exceptions
105         switch(inwd[0]) {
106         case 'K' : 
107         case 'G' : 
108         case 'P' : /* looking for KN, etc*/
109             if (inwd[1] == 'N') {
110                 local.append(inwd, 1, inwd.length - 1);
111             } else {
112                 local.append(inwd);
113             }
114             break;
115         case 'A': /* looking for AE */
116             if (inwd[1] == 'E') {
117                 local.append(inwd, 1, inwd.length - 1);
118             } else {
119                 local.append(inwd);
120             }
121             break;
122         case 'W' : /* looking for WR or WH */
123             if (inwd[1] == 'R') {   // WR -> R
124                 local.append(inwd, 1, inwd.length - 1); 
125                 break ;
126             }
127             if (inwd[1] == 'H') {
128                 local.append(inwd, 1, inwd.length - 1);
129                 local.setCharAt(0, 'W'); // WH -> W
130             } else {
131                 local.append(inwd);
132             }
133             break;
134         case 'X' : /* initial X becomes S */
135             inwd[0] = 'S';
136             local.append(inwd);
137             break ;
138         default :
139             local.append(inwd);
140         } // now local has working string with initials fixed
141 
142         int wdsz = local.length();
143         int n = 0 ;
144 
145         while ((code.length() < this.getMaxCodeLen()) && 
146         	   (n < wdsz) ) { // max code size of 4 works well
147             char symb = local.charAt(n) ;
148             // remove duplicate letters except C
149             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
150                 n++ ;
151             } else { // not dup
152                 switch(symb) {
153                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
154                     if (n == 0) { 
155                         code.append(symb);
156                     }
157                     break ; // only use vowel if leading char
158                 case 'B' :
159                     if ( isPreviousChar(local, n, 'M') && 
160                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
161 						break;
162                     }
163                     code.append(symb);
164                     break;
165                 case 'C' : // lots of C special cases
166                     /* discard if SCI, SCE or SCY */
167                     if ( isPreviousChar(local, n, 'S') && 
168                          !isLastChar(wdsz, n) && 
169                          (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { 
170                         break;
171                     }
172                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
173                         code.append('X'); 
174                         break;
175                     }
176                     if (!isLastChar(wdsz, n) && 
177                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
178                         code.append('S');
179                         break; // CI,CE,CY -> S
180                     }
181                     if (isPreviousChar(local, n, 'S') &&
182 						isNextChar(local, n, 'H') ) { // SCH->sk
183                         code.append('K') ; 
184                         break ;
185                     }
186                     if (isNextChar(local, n, 'H')) { // detect CH
187                         if ((n == 0) && 
188                         	(wdsz >= 3) && 
189                             isVowel(local,2) ) { // CH consonant -> K consonant
190                             code.append('K');
191                         } else { 
192                             code.append('X'); // CHvowel -> X
193                         }
194                     } else { 
195                         code.append('K');
196                     }
197                     break ;
198                 case 'D' :
199                     if (!isLastChar(wdsz, n + 1) && 
200                         isNextChar(local, n, 'G') && 
201                         (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
202                         code.append('J'); n += 2 ;
203                     } else { 
204                         code.append('T');
205                     }
206                     break ;
207                 case 'G' : // GH silent at end or before consonant
208                     if (isLastChar(wdsz, n + 1) && 
209                         isNextChar(local, n, 'H')) {
210                         break;
211                     }
212                     if (!isLastChar(wdsz, n + 1) &&  
213                         isNextChar(local,n,'H') && 
214                         !isVowel(local,n+2)) {
215                         break;
216                     }
217                     if ((n > 0) && 
218                     	( regionMatch(local, n, "GN") ||
219 					      regionMatch(local, n, "GNED") ) ) {
220                         break; // silent G
221                     }
222                     if (isPreviousChar(local, n, 'G')) {
223                         hard = true ;
224                     } else {
225                         hard = false ;
226                     }
227                     if (!isLastChar(wdsz, n) && 
228                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && 
229                         (!hard)) {
230                         code.append('J');
231                     } else {
232                         code.append('K');
233                     }
234                     break ;
235                 case 'H':
236                     if (isLastChar(wdsz, n)) {
237                         break ; // terminal H
238                     }
239                     if ((n > 0) && 
240                         (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
241                         break;
242                     }
243                     if (isVowel(local,n+1)) {
244                         code.append('H'); // Hvowel
245                     }
246                     break;
247                 case 'F': 
248                 case 'J' : 
249                 case 'L' :
250                 case 'M': 
251                 case 'N' : 
252                 case 'R' :
253                     code.append(symb); 
254                     break;
255                 case 'K' :
256                     if (n > 0) { // not initial
257                         if (!isPreviousChar(local, n, 'C')) {
258                             code.append(symb);
259                         }
260                     } else {
261                         code.append(symb); // initial K
262                     }
263                     break ;
264                 case 'P' :
265                     if (isNextChar(local,n,'H')) {
266                         // PH -> F
267                         code.append('F');
268                     } else {
269                         code.append(symb);
270                     }
271                     break ;
272                 case 'Q' :
273                     code.append('K');
274                     break;
275                 case 'S' :
276                     if (regionMatch(local,n,"SH") || 
277 					    regionMatch(local,n,"SIO") || 
278 					    regionMatch(local,n,"SIA")) {
279                         code.append('X');
280                     } else {
281                         code.append('S');
282                     }
283                     break;
284                 case 'T' :
285                     if (regionMatch(local,n,"TIA") || 
286 						regionMatch(local,n,"TIO")) {
287                         code.append('X'); 
288                         break;
289                     }
290                     if (regionMatch(local,n,"TCH")) {
291 						// Silent if in "TCH"
292                         break;
293                     }
294                     // substitute numeral 0 for TH (resembles theta after all)
295                     if (regionMatch(local,n,"TH")) {
296                         code.append('0');
297                     } else {
298                         code.append('T');
299                     }
300                     break ;
301                 case 'V' :
302                     code.append('F'); break ;
303                 case 'W' : case 'Y' : // silent if not followed by vowel
304                     if (!isLastChar(wdsz,n) && 
305                     	isVowel(local,n+1)) {
306                         code.append(symb);
307                     }
308                     break ;
309                 case 'X' :
310                     code.append('K'); code.append('S');
311                     break ;
312                 case 'Z' :
313                     code.append('S'); break ;
314                 } // end switch
315                 n++ ;
316             } // end else from symb != 'C'
317             if (code.length() > this.getMaxCodeLen()) { 
318             	code.setLength(this.getMaxCodeLen()); 
319             }
320         }
321         return code.toString();
322     }
323 
324 	private boolean isVowel(StringBuffer string, int index) {
325 		return (this.vowels.indexOf(string.charAt(index)) >= 0);
326 	}
327 
328 	private boolean isPreviousChar(StringBuffer string, int index, char c) {
329 		boolean matches = false;
330 		if( index > 0 &&
331 		    index < string.length() ) {
332 			matches = string.charAt(index - 1) == c;
333 		}
334 		return matches;
335 	}
336 
337 	private boolean isNextChar(StringBuffer string, int index, char c) {
338 		boolean matches = false;
339 		if( index >= 0 &&
340 		    index < string.length() - 1 ) {
341 			matches = string.charAt(index + 1) == c;
342 		}
343 		return matches;
344 	}
345 
346 	private boolean regionMatch(StringBuffer string, int index, String test) {
347 		boolean matches = false;
348 		if( index >= 0 &&
349 		    (index + test.length() - 1) < string.length() ) {
350 			String substring = string.substring( index, index + test.length());
351 			matches = substring.equals( test );
352 		}
353 		return matches;
354 	}
355 
356 	private boolean isLastChar(int wdsz, int n) {
357 		return n + 1 == wdsz;
358 	} 
359     
360     
361     /**
362      * Encodes an Object using the metaphone algorithm.  This method
363      * is provided in order to satisfy the requirements of the
364      * Encoder interface, and will throw an EncoderException if the
365      * supplied object is not of type java.lang.String.
366      *
367      * @param pObject Object to encode
368      * @return An object (or type java.lang.String) containing the 
369      *         metaphone code which corresponds to the String supplied.
370      * @throws EncoderException if the parameter supplied is not
371      *                          of type java.lang.String
372      */
373     public Object encode(Object pObject) throws EncoderException {
374         if (!(pObject instanceof java.lang.String)) {
375             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 
376         }
377         return metaphone((String) pObject);
378     }
379 
380     /**
381      * Encodes a String using the Metaphone algorithm. 
382      *
383      * @param pString String object to encode
384      * @return The metaphone code corresponding to the String supplied
385      */
386     public String encode(String pString) {
387         return metaphone(pString);   
388     }
389 
390     /**
391      * Tests is the metaphones of two strings are identical.
392      *
393      * @param str1 First of two strings to compare
394      * @param str2 Second of two strings to compare
395      * @return true if the metaphones of these strings are identical, 
396      *         false otherwise.
397      */
398     public boolean isMetaphoneEqual(String str1, String str2) {
399         return metaphone(str1).equals(metaphone(str2));
400     }
401 
402     /**
403      * Returns the maxCodeLen.
404      * @return int
405      */
406     public int getMaxCodeLen() { return this.maxCodeLen; }
407 
408     /**
409      * Sets the maxCodeLen.
410      * @param maxCodeLen The maxCodeLen to set
411      */
412     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
413 
414 }