个性化阅读
专注于IT技术分析

Soundex算法(函数)在不同编程语言中的实现

本文概述

Soundex是一种语音算法, 用于按声音索引名称(英语发音), 可以将来自不同字符串的SOUNDEX代码进行比较, 以查看说话时字符串听起来的相似程度。

代码的第一个字符是表达式的第一个字符, 转换为大写。代码的第二到第四个字符是代表表达式中字母的数字。字母A, E, I, O, U, H, W和Y会被忽略, 除非它们是字符串的第一个字母。所有A-Z范围以外的国际字母字符都被视为元音。因此, 发音几乎相同的两个字符串应该具有相同的Soundex代码。例如, 单词"text"和"tixt"都会生成Soundex代码"T230"。

让我们开始吧 !

C

#include <stdio.h>

/* Soundex class per ASCII character, indexed by character value and filled in
 * by add_code() via init(): -1 for the vowels AEIOU, 1..6 for the coded
 * consonant groups, and 0 (the initial value) for every other character. */
static char code[128] = { 0 };

/*
 * Compute the 4-character Soundex code of s.
 *
 * Requires init() to have filled the `code` table first.
 *
 * s: NUL-terminated name; NULL or "" yields an empty string.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call
 * (not reentrant). The result is the first character of s, upper-cased if it
 * is an ASCII lowercase letter, followed by up to three digits and padded
 * with '0' to length 4.
 *
 * Bytes outside the 0..127 table range are skipped, exactly like every other
 * character whose table entry is 0.
 */
const char* soundex(const char *s)
{
	static char out[5];
	int c, prev, i;
	unsigned char u;

	out[0] = out[4] = 0;
	if (!s || !*s) return out;

	/* Go through unsigned char so bytes >= 0x80 can never become a
	 * negative table index. */
	u = (unsigned char)*s++;
	out[0] = (u >= 'a' && u <= 'z') ? (char)(u ^ 0x20) : (char)u;

	/* first letter, though not coded, can still affect next letter: Pfister */
	/* The signed char cast keeps the -1 vowel marker intact on platforms
	 * where plain char is unsigned. */
	prev = u < 128 ? (signed char)code[u] : 0;
	for (i = 1; *s && i < 4; s++) {
		u = (unsigned char)*s;
		c = u < 128 ? (signed char)code[u] : 0;
		if (c == prev) continue;

		if (c == -1) prev = 0;	/* vowel as separator */
		else if (c > 0) {
			out[i++] = (char)(c + '0');
			prev = c;
		}
	}
	while (i < 4) out[i++] = '0';
	return out;
}

/*
 * Store class c in the `code` table for every character of s, and for the
 * same character with bit 0x20 flipped (the other letter case for ASCII
 * letters).
 */
void add_code(const char *s, int c)
{
	const char *p;

	for (p = s; *p != '\0'; p++) {
		int ch = (int)*p;

		code[0x20 ^ ch] = c;
		code[ch] = c;
	}
}
 
/*
 * Fill the `code` table. Each group's class is its position in the list
 * minus one: "AEIOU" -> -1, the empty group -> 0, "BFPV" -> 1, ... "R" -> 6.
 */
void init()
{
	static const char *groups[] =
		{ "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0};
	const char **g;
	int value = -1;

	for (g = groups; *g != 0; g++, value++)
		add_code(*g, value);
}

用法

/* Demo entry point: build the code table, then print the code of one name. */
int main()
{
    init();
    /* J126 */
    /* Pass the result as an argument: using it as the format string itself
     * would interpret any '%' it contained. */
    printf("%s\n", soundex("Javascript"));
 
    return 0;
}

C#

using System.Text.RegularExpressions;

/// <summary>
/// American Soundex encoder: maps a word to a letter followed by three digits
/// so that similar-sounding English words share a code.
/// </summary>
public static class Soundex
{
    /// <summary>
    /// Computes the Soundex code of <paramref name="word"/>.
    /// </summary>
    /// <param name="word">
    /// Text to encode; may be null. Characters that are neither word
    /// characters nor whitespace are removed before encoding.
    /// </param>
    /// <returns>
    /// A 4-character code: the first remaining character, upper-cased, then up
    /// to three digits, right-padded with '0'. Returns "0000" when nothing
    /// remains to encode.
    /// </returns>
    public static string For(string word)
    {
        const int MaxSoundexCodeLength = 4;

        // Invariant upper-casing keeps the A-Z comparisons below independent
        // of the current culture (e.g. the Turkish dotted/dotless i).
        word = Regex.Replace(
            word == null ? string.Empty : word.ToUpperInvariant(), @"[^\w\s]", string.Empty);

        if (string.IsNullOrEmpty(word))
            return new string('0', MaxSoundexCodeLength);

        var soundexCode = new System.Text.StringBuilder(MaxSoundexCodeLength);
        soundexCode.Append(word[0]);

        // The first letter is emitted as-is, but its digit still suppresses an
        // immediately following letter of the same group (e.g. "Pfister").
        var previousCode = GetCharNumberForLetter(word[0]);

        for (var i = 1; i < word.Length && soundexCode.Length < MaxSoundexCodeLength; i++)
        {
            var letter = word[i];

            // H and W are transparent: they neither produce a digit nor
            // separate two letters of the same group (e.g. "Ashcraft").
            if (letter == 'H' || letter == 'W')
                continue;

            var currentCode = GetCharNumberForLetter(letter);

            if (currentCode != '0' && currentCode != previousCode)
                soundexCode.Append(currentCode);

            // Vowels and other uncoded characters reset the run, so the same
            // digit may be emitted again after them.
            previousCode = currentCode;
        }

        return soundexCode.ToString().PadRight(MaxSoundexCodeLength, '0');
    }

    /// <summary>
    /// Returns the Soundex digit ('1'..'6') for an upper-case letter, or '0'
    /// for any character that carries no digit.
    /// </summary>
    private static char GetCharNumberForLetter(char letter)
    {
        switch (letter)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K':
            case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return '0';
        }
    }
}

用法

// Similar-sounding inputs produce the same code, so the comparison holds.
Soundex.For("CSharp Language") == Soundex.For("CSherp Language"); // True as C614 == C614

D

D标准库(Phobos)已包含soundex函数。

import std.stdio: writeln;
import std.string: soundex;
 
void main() {
    // Each entry is [input, expected code] for Phobos' std.string.soundex.
    immutable string[][] cases = [
        ["soundex", "S532"],
        ["example", "E251"],
        ["ciondecks", "C532"],
        ["ekzampul", "E251"],
        ["Robert", "R163"],
        ["Rupert", "R163"],
        ["Rubin", "R150"],
        ["Ashcraft", "A261"],
        ["Ashcroft", "A261"],
        ["Tymczak", "T522"],
    ];

    foreach (pair; cases)
        assert(soundex(pair[0]) == pair[1]);
}

F#

/// Computes the American Soundex code of `x`.
///
/// Returns a 4-character string: the first character of `x` (upper-cased)
/// followed by up to three digits, padded with '0'. A null or empty input
/// yields "0000".
let americanSoundex (x : string) = 
    // H and W are transparent: they produce no digit and do not separate two
    // letters of the same group.
    let isHW ch = (ch = 'H' || ch = 'W')

    // Digit for a coded consonant; '0' for anything that carries no digit
    // (vowels, Y, and non-letters), which still acts as a separator.
    let toCode ch =
        match ch with
        | 'B' | 'F' | 'P' | 'V' -> '1'
        | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> '2'
        | 'D' | 'T' -> '3'
        | 'L' -> '4'
        | 'M' | 'N' -> '5'
        | 'R' -> '6'
        | _ -> '0'

    // Collapse every run of equal adjacent codes to a single element,
    // whatever the run length.
    let rec squeeze xs =
        match xs with
        | h0 :: h1 :: t when h0 = h1 -> squeeze (h0 :: t)
        | h :: t -> h :: squeeze t
        | [] -> []

    if System.String.IsNullOrEmpty x then "0000"
    else
        let letters = x.ToUpperInvariant() |> List.ofSeq
        let first = List.head letters

        let collapsed =
            letters
            |> List.filter (fun ch -> not (isHW ch))
            |> List.map toCode
            |> squeeze

        // When the name starts with H or W that letter was filtered out above,
        // so every collapsed code belongs to a later letter and is kept.
        // Otherwise the head is the first letter's own code and is dropped.
        let afterFirst = if isHW first then collapsed else List.tail collapsed

        let digits = afterFirst |> List.filter (fun c -> c <> '0')
        let padded = (first :: digits) @ [ '0'; '0'; '0' ] |> List.truncate 4
        System.String(Array.ofList padded)

// Print "name = code" for each sample name, left-aligning the name in 8 columns.
["Robert"; "Rupert"; "Robbert"; "Rubin"; 
 "Beer"; "Bear"; "Bearer"; 
 "Smith"; "Smyth";
 "Ashcraft"; "Ashcroft";
  "Tymczak"; "Pfister"] 
 |> List.map (fun x -> (x, americanSoundex x)) |> List.iter (fun (x, y) -> printfn "%-8s = %s" x y)

(*

Robert   = R163
Rupert   = R163
Robbert  = R163
Rubin    = R150
Beer     = B600
Bear     = B600
Bearer   = B660
Smith    = S530
Smyth    = S530
Ashcraft = A261
Ashcroft = A261
Tymczak  = T522
Pfister  = P236

*)

Go

package myPackageName

import (
	"bytes"
	"strings"
	"fmt"
)

// codeLen is the total length of the code Soundex produces.
const codeLen = 4

// codes maps each lowercase ASCII letter to its Soundex digit; letters that
// carry no digit (a, e, i, o, u, h, w, y) map to the empty string.
var codes = map[string]string{
	"a": "", "b": "1", "c": "2", "d": "3", "e": "", "f": "1", "g": "2", "h": "", "i": "", "j": "2", "k": "2", "l": "4", "m": "5", "n": "5", "o": "", "p": "1", "q": "2", "r": "6", "s": "2", "t": "3", "u": "", "v": "1", "w": "", "x": "2", "y": "", "z": "2", }

func Soundex(s string) string {
	var encoded bytes.Buffer
	encoded.WriteByte(s[0])

	for i := 1; i < len(s); i++ {
		if encoded.Len() == codeLen {
			break
		}

		previous, current := strings.ToLower(string(s[i-1])), strings.ToLower(string(s[i]))

		var next string
		if i+1 < len(s) {
			next = strings.ToLower(string(s[i+1]))
		}

		if (current == "h" || current == "w") && (codes[previous] == codes[next]) {
			i = i + 1
			continue
		}

		if c, ok := codes[current]; ok && len(c) > 0 {
			encoded.WriteByte(c[0])
		}

		if codes[current] == codes[next] {
			i = i + 1
			continue
		}
	}

	if encoded.Len() < codeLen {
		padding := strings.Repeat("0", codeLen-encoded.Len())
		encoded.WriteString(padding)
	}

	return strings.ToUpper(encoded.String())
}

用法

// main prints the Soundex code of one sample word.
func main() {
	sample := "Javascript"
	// Prints J126.
	fmt.Println(Soundex(sample))
}

Java

/**
 * Returns the Soundex digit for an upper-case letter.
 *
 * @param c the character to classify
 * @return "1".."6" for a coded consonant, or "" for any other character
 */
private static String getCode(char c){
  if ("BFPV".indexOf(c) >= 0) {
    return "1";
  }
  if ("CGJKQSXZ".indexOf(c) >= 0) {
    return "2";
  }
  if (c == 'D' || c == 'T') {
    return "3";
  }
  if (c == 'L') {
    return "4";
  }
  if (c == 'M' || c == 'N') {
    return "5";
  }
  if (c == 'R') {
    return "6";
  }
  return "";
}
 
/**
 * Computes the American Soundex code of a name.
 *
 * @param s the name to encode; null or empty yields ""
 * @return the upper-cased first character followed by up to three digits,
 *         padded with '0' to four characters
 */
public static String soundex(String s){
  if (s == null || s.isEmpty()) {
    return "";
  }

  // Upper-case once, locale-independently, and iterate over that string:
  // upper-casing can change the length, so indexing it with s.length() is unsafe.
  String upper = s.toUpperCase(java.util.Locale.ROOT);

  StringBuilder code = new StringBuilder(4);
  code.append(upper.charAt(0));

  // The first letter is emitted verbatim, but its digit still suppresses an
  // immediately following letter of the same group (e.g. "Pfister").
  String previous = getCode(upper.charAt(0));

  for(int i = 1; i < upper.length() && code.length() < 4; i++){
    char ch = upper.charAt(i);
    // H and W are transparent: they neither emit a digit nor separate two
    // letters of the same group (e.g. "Ashcraft").
    if (ch == 'H' || ch == 'W') {
      continue;
    }
    String current = getCode(ch);
    if(current.length() > 0 && !current.equals(previous)){
      code.append(current);
    }
    previous = current;
  }

  while (code.length() < 4) {
    code.append('0');
  }
  return code.toString();
}

用法

/** Prints the Soundex code of four sample words, one per line. */
public static void main(String[] args){
    // Expected output, in order: S532, E251, S532, E251
    String[] samples = {"Soundex", "Example", "Sownteks", "Ekzampul"};
    for (String sample : samples) {
        System.out.println(soundex(sample));
    }
}

JavaScript

/**
 * Computes the American Soundex code of a string.
 *
 * @param {string} s - Name to encode.
 * @returns {string} The upper-cased first character followed by up to three
 *   digits, padded with '0' to four characters; '' for an empty input.
 */
var soundex = function(s) {
    var codes = { a: '', e: '', i: '', o: '', u: '', b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 };
    var letters = s.toLowerCase().split('');

    // Without this guard shift() yields undefined and the result would be "UNDE".
    if (letters.length === 0) {
        return '';
    }

    var first = letters.shift();
    var result = first;

    // The first letter is emitted verbatim, but its digit still suppresses an
    // immediately following letter of the same group (e.g. "Pfister").
    var last = codes[first] || '';

    for (var i = 0; i < letters.length && result.length < 4; i++) {
        var ch = letters[i];

        // h and w are transparent: they neither emit a digit nor separate two
        // letters of the same group (e.g. "Ashcraft").
        if (ch === 'h' || ch === 'w') {
            continue;
        }

        // Characters missing from the table behave like vowels (separators).
        var code = codes[ch] || '';
        if (code !== '' && code !== last) {
            result += code;
        }
        last = code;
    }

    return (result + '000').slice(0, 4).toUpperCase();
};

用法

// Similar-sounding inputs produce the same code, so the comparison holds.
soundex("Javascript") == soundex("Jabascript"); // True as J126 == J126

Objective-C

你可以在Darkseed编写的github gist中找到Soundex算法Objective-C的实现。

PHP

PHP已经将soundex作为内置函数来计算字符串的soundex键。

用法

// NOTE(review): the original comment claimed the key is "P100"; PHP's built-in
// appears to skip non-letters and keep encoding, which would give "P261" for
// both strings - confirm by running. The comparison is true either way.
soundex("PHP Server Language") == soundex("PHP Serber language"); // True: both strings yield the same key

python

函数

def get_soundex(name):
	"""Return the American Soundex code for ``name``.

	The result is the upper-cased first character followed by up to three
	digits, padded with "0" to four characters. An empty string returns "".

	Vowels (A, E, I, O, U, Y) emit no digit but separate equal digits; H, W
	and any character outside the table are skipped entirely, so they do not
	separate two letters of the same group.
	"""
	name = name.upper()
	if not name:
		return ""

	groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6", "AEIOUY": ""}
	# Flatten to one entry per letter so each lookup is a single dict access.
	codes = {letter: digit for letters, digit in groups.items() for letter in letters}

	soundex = name[0]
	# The first letter is emitted verbatim, but its digit still suppresses an
	# immediately following letter of the same group (e.g. "Pfister").
	last = codes.get(name[0], "")

	for char in name[1:]:
		code = codes.get(char)
		if code is None:
			# H, W and non-letters are transparent (e.g. "Ashcraft").
			continue
		if code and code != last:
			soundex += code
		last = code

	return soundex[:4].ljust(4, "0")

用法

    # Print a two-column table of sample names and their Soundex codes.
    # The snippet stays indented one level, as in the article, so it is meant
    # to sit inside a function or an `if __name__ == "__main__":` block.
    # `names` is used instead of `list` so the builtin is not shadowed.
    names = ["Smith", "Smythe", "Robert", "Rupert", "Schultz", "Shultz"]

    print("NAME\t\tSOUNDEX")
    for name in names:
        print("%s\t\t%s" % (name, get_soundex(name)))

库

如果你更喜欢使用库, 则可以使用Fuzzy包(它通过Pyrex使用C扩展来提高速度)。

Ruby

class String
 
  # Letters that carry a Soundex digit, position-aligned with SoundexNums.
  SoundexChars = 'BFPVCGJKQSXZDTLMNR'
  SoundexNums  = '111122222222334556'
  SoundexCharsEx = '^' + SoundexChars
  SoundexCharsDel = '^A-Z'
 
  # desc: http://en.wikipedia.org/wiki/Soundex
  #
  # Returns the American Soundex code of the string: its first A-Z letter
  # (upper-cased) followed by digits, padded with '0' to at least three digits.
  # Returns '' when the string contains no A-Z letter.
  #
  # census - when true (default) keep at most three digits; when false keep
  #          every digit the string produces.
  def soundex(census = true)
    str = upcase.delete(SoundexCharsDel)
    return '' if str.empty?

    # Coded consonants become digits; vowels, H, W and Y stay as letters so
    # they can still act as separators (or be skipped) below.
    mapped = str.tr(SoundexChars, SoundexNums)
    digits = ''.dup

    # The first letter is emitted verbatim, but its digit still suppresses an
    # immediately following letter of the same group (e.g. "Pfister").
    last = mapped[0, 1]

    mapped[1..-1].each_char do |ch|
      # H and W are transparent: they neither emit a digit nor separate two
      # letters of the same group (e.g. "Ashcraft").
      next if ch == 'H' || ch == 'W'

      digits << ch if ch != last && SoundexNums.include?(ch)
      last = ch
    end

    digits = digits[0, 3] if census
    str[0, 1] + digits.ljust(3, '0')
  end
 
  # True when both strings have the same Soundex code.
  def sounds_like(other)
    self.soundex == other.soundex
  end
end

用法

# Take the sample words two at a time, print each word's code, then report
# whether the pair sounds alike.
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |word1, word2|
  [word1, word2].each {|word| puts '%-8s -> %s' % [word, word.soundex]}
 
  print "'#{word1}' "
  print word1.sounds_like(word2) ? "sounds" : "does not sound"
  print " like '#{word2}'\n"
end

#Soundex  -> S532
#Sownteks -> S532
#'Soundex' sounds like 'Sownteks'
#Example  -> E251
#Ekzampul -> E251
#'Example' sounds like 'Ekzampul'
#foo      -> F000
#bar      -> B600
#'foo' does not sound like 'bar'

Scala

/** Computes a 4-character Soundex code for `s`.
  *
  * The result is the upper-cased first character followed by up to three
  * digits, padded with '0'. An empty string returns "" instead of throwing
  * from `head`.
  *
  * Every uncoded letter, H and W included, resets the duplicate check. This
  * is the variant that the expected values in this file's test table rely on
  * (e.g. "Ashcraft" -> "A226").
  */
def soundex(s:String)={
   if (s.isEmpty) ""
   else {
      var code=s.head.toUpper.toString
      // The first letter is emitted verbatim, but its digit still suppresses
      // an immediately following letter of the same group.
      var previous=getCode(code.head)
      for(ch <- s.drop(1); current=getCode(ch.toUpper)){
         if (!current.isEmpty && current!=previous)
            code+=current
         previous=current
      }
      code+="0000"
      code.slice(0, 4)
   }
}
 
/** Returns the Soundex digit ("1".."6") for an upper-case letter, or "" for
  * any character that carries no digit.
  */
def getCode(c:Char)={
   c match {
      case 'B' | 'F' | 'P' | 'V' => "1"
      case 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2"
      case 'D' | 'T' => "3"
      case 'L' => "4"
      case 'M' | 'N' => "5"
      case 'R' => "6"
      case _ => ""
   }
}

用法

/** Runs `soundex` over a table of names and prints, per name, the expected
  * code, the computed code and an OK/ERROR verdict.
  */
def main(args: Array[String]): Unit = {
   // name -> expected code
   val expectedByName=Map(
      "Soundex" -> "S532",
      "Euler" -> "E460",
      "Gauss" -> "G200",
      "Hilbert" -> "H416",
      "Knuth" -> "K530",
      "Lloyd" -> "L300",
      "Lukasiewicz" -> "L222",
      "Ellery" -> "E460",
      "Ghosh" -> "G200",
      "Heilbronn" -> "H416",
      "Kant" -> "K530",
      "Ladd" -> "L300",
      "Lissajous" -> "L222",
      "Wheaton" -> "W350",
      "Ashcraft" -> "A226",
      "Burroughs" -> "B622",
      "Burrows" -> "B620",
      "O'Hara" -> "O600")
 
   for ((name, expected) <- expectedByName) {
      val actual=soundex(name)
      val verdict=if (actual==expected) "OK" else "ERROR"
      printf("Name: %-20s  Code: %s   Found: %s  - %s\n", name, expected, actual, verdict)
   }
}

Swift

在这个github仓库中, cafford编写的类是Swift语言中原始Soundex算法的实现。

//
//  Soundex.swift
//  speller
//
//  Created by Clifford Helsel on 4/28/16.
//
//  Based on standard Soundex algorithm and loosely ported from Apache Commons
//  https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/Soundex.html


/// American Soundex encoder for English names.
///
/// NOTE(review): this version treats `String` as a collection of characters
/// (Swift 4 and later) instead of going through the `.characters` view.
public class Soundex {
    
    /// Soundex digit for each letter A...Z, position-aligned with
    /// `en_alphabet`; "0" marks letters that carry no digit.
    private static let en_mapping_string = Array("01230120022455012623010202")
    private static let en_alphabet = Array("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    private static let mapping: [Character:Character] = Soundex.buildMapping(letters: en_alphabet, digits: en_mapping_string)
    
    /// Pairs each letter with the digit at the same position.
    private static func buildMapping(letters: [Character], digits: [Character]) -> [Character:Character] {
        var retval: [Character:Character] = [:]
        for (index, letter) in letters.enumerated() {
            retval[letter] = digits[index]
        }
        return retval
    }
    
    /// Digit for `c`, or "0" when `c` is not an upper-case A...Z letter.
    private func mapChar(c: Character) -> Character {
        // not specified in original Soundex specification, if character is not found, code is 0
        return Soundex.mapping[c] ?? "0"
    }
    
    /// Computes the Soundex code of `of`.
    ///
    /// - Parameter of: The name to encode.
    /// - Returns: The upper-cased first character followed by up to three
    ///   digits, padded with "0" to four characters; "" for an empty input.
    public func soundex(of: String) -> String {
        let letters = Array(of.uppercased())
        
        guard let first = letters.first else {
            return ""
        }
        
        var out: [Character] = [first]
        
        // The first letter is emitted verbatim, but its digit still suppresses
        // an immediately following letter of the same group.
        var last = mapChar(c: first)
        
        for ch in letters.dropFirst() {
            if out.count == 4 {
                break
            }
            // H and W are transparent: they neither emit a digit nor separate
            // two letters of the same group.
            if ch == "H" || ch == "W" {
                continue
            }
            let mapped = mapChar(c: ch)
            if mapped != "0" && mapped != last {
                out.append(mapped)
            }
            // Track the previous code on every step so that only adjacent
            // duplicates are collapsed.
            last = mapped
        }
        
        // Pad with zeros, not spaces, to the fixed four-character width.
        while out.count < 4 {
            out.append("0")
        }
        return String(out)
    }
}

用法

let c = Soundex()

// NOTE(review): standard American Soundex for "Christopher" is C623; the
// original comment here said C631 - confirm against the class's actual output.
c.soundex(of:"Christopher")

VBScript

' Returns the Soundex digit ("1".."6") for the upper-case letter c.
' There is no Case Else branch, so for any other value the function's return
' value is never assigned.
Function getCode(c)
    Select Case c
        Case "B", "F", "P", "V"
            getCode = "1"
        Case "C", "G", "J", "K", "Q", "S", "X", "Z"
            getCode = "2"
        Case "D", "T"
            getCode = "3"
        Case "L"
            getCode = "4"
        Case "M", "N"
            getCode = "5"
        Case "R"
            getCode = "6"
    End Select
End Function
 
' Computes the American Soundex code of s: the upper-cased first character
' followed by up to three digits, padded with "0" to four characters.
' An empty s yields "0000".
Function soundex(s)
    ' Declare every local so the function also works under Option Explicit.
    Dim code, previous, current, letter, i
    code = UCase(Mid(s, 1, 1))
    ' The first letter is emitted verbatim, but its digit still suppresses an
    ' immediately following letter of the same group (e.g. "Pfister").
    previous = getCode(code)
    For i = 2 To Len(s)
        If Len(code) >= 4 Then Exit For
        letter = UCase(Mid(s, i, 1))
        ' H and W are transparent: they neither emit a digit nor separate two
        ' letters of the same group (e.g. "Ashcraft").
        If letter <> "H" And letter <> "W" Then
            current = getCode(letter)
            If Len(current) > 0 And current <> previous Then
                code = code & current
            End If
            previous = current
        End If
    Next
    soundex = Mid(code, 1, 4)
    If Len(code) < 4 Then
        soundex = soundex & String(4 - Len(code), "0")
    End If
End Function

最后, 如果你知道Soundex算法在另一种语言中的实现(或者你对现有语言有更好的摘录), 请不要害羞, 并在评论框中与我们分享, 祝你玩得开心!

赞(0)
未经允许不得转载:srcmini » Soundex算法(函数)在不同编程语言中的实现

评论 抢沙发

评论前必须登录!