@@ -918,4 +918,219 @@ PHP_FUNCTION(grapheme_str_split)
918
918
ubrk_close (bi );
919
919
}
920
920
921
+ PHP_FUNCTION (grapheme_levenshtein )
922
+ {
923
+ zend_string * string1 , * string2 ;
924
+ zend_long cost_ins = 1 ;
925
+ zend_long cost_rep = 1 ;
926
+ zend_long cost_del = 1 ;
927
+
928
+ ZEND_PARSE_PARAMETERS_START (2 , 5 )
929
+ Z_PARAM_STR (string1 )
930
+ Z_PARAM_STR (string2 )
931
+ Z_PARAM_OPTIONAL
932
+ Z_PARAM_LONG (cost_ins )
933
+ Z_PARAM_LONG (cost_rep )
934
+ Z_PARAM_LONG (cost_del )
935
+ ZEND_PARSE_PARAMETERS_END ();
936
+
937
+ if (cost_ins <= 0 || cost_ins > UINT_MAX / 4 ) {
938
+ zend_argument_value_error (3 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
939
+ RETURN_THROWS ();
940
+ }
941
+
942
+ if (cost_rep <= 0 || cost_rep > UINT_MAX / 4 ) {
943
+ zend_argument_value_error (4 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
944
+ RETURN_THROWS ();
945
+ }
946
+
947
+ if (cost_del <= 0 || cost_del > UINT_MAX / 4 ) {
948
+ zend_argument_value_error (5 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
949
+ RETURN_THROWS ();
950
+ }
951
+
952
+ zend_long c0 , c1 , c2 ;
953
+ zend_long retval ;
954
+ size_t i2 ;
955
+ char * pstr1 , * pstr2 ;
956
+
957
+ UChar * ustring1 = NULL ;
958
+ UChar * ustring2 = NULL ;
959
+
960
+ int32_t ustring1_len = 0 ;
961
+ int32_t ustring2_len = 0 ;
962
+
963
+ UErrorCode ustatus = U_ZERO_ERROR ;
964
+
965
+ /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
966
+ * that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
967
+ * by having shorter rows (p1 & p2). */
968
+ if (ZSTR_LEN (string1 ) < ZSTR_LEN (string2 ) && cost_ins == cost_rep && cost_rep == cost_del ) {
969
+ zend_string * tmp = string1 ;
970
+ string1 = string2 ;
971
+ string2 = tmp ;
972
+ }
973
+
974
+ pstr1 = ZSTR_VAL (string1 );
975
+ pstr2 = ZSTR_VAL (string2 );
976
+
977
+ intl_convert_utf8_to_utf16 (& ustring1 , & ustring1_len , pstr1 , ZSTR_LEN (string1 ), & ustatus );
978
+
979
+ if (U_FAILURE (ustatus )) {
980
+ intl_error_set_code (NULL , ustatus );
981
+
982
+ intl_error_set_custom_msg (NULL , "Error converting input string to UTF-16" , 0 );
983
+ efree (ustring1 );
984
+ RETURN_FALSE ;
985
+ }
986
+
987
+ intl_convert_utf8_to_utf16 (& ustring2 , & ustring2_len , pstr2 , ZSTR_LEN (string2 ), & ustatus );
988
+
989
+ if (U_FAILURE (ustatus )) {
990
+ intl_error_set_code (NULL , ustatus );
991
+
992
+ intl_error_set_custom_msg (NULL , "Error converting input string to UTF-16" , 0 );
993
+ efree (ustring2 );
994
+ efree (ustring1 );
995
+ RETURN_FALSE ;
996
+ }
997
+
998
+ UBreakIterator * bi1 , * bi2 ;
999
+
1000
+ int32_t strlen_1 , strlen_2 ;
1001
+ strlen_1 = grapheme_split_string (ustring1 , ustring1_len , NULL , 0 );
1002
+ strlen_2 = grapheme_split_string (ustring2 , ustring2_len , NULL , 0 );
1003
+
1004
+ if (strlen_1 == 0 ) {
1005
+ efree (ustring1 );
1006
+ efree (ustring2 );
1007
+ RETURN_LONG (strlen_2 * cost_ins );
1008
+ }
1009
+ if (strlen_2 == 0 ) {
1010
+ efree (ustring1 );
1011
+ efree (ustring2 );
1012
+ RETURN_LONG (strlen_1 * cost_del );
1013
+ }
1014
+
1015
+ unsigned char u_break_iterator_buffer1 [U_BRK_SAFECLONE_BUFFERSIZE ];
1016
+ unsigned char u_break_iterator_buffer2 [U_BRK_SAFECLONE_BUFFERSIZE ];
1017
+ bi1 = grapheme_get_break_iterator (u_break_iterator_buffer1 , & ustatus );
1018
+ if (U_FAILURE (ustatus )) {
1019
+ intl_error_set_code (NULL , ustatus );
1020
+ intl_error_set_custom_msg (NULL , "Error on grapheme_get_break_iterator for argument #1 ($string1)" , 0 );
1021
+ efree (ustring2 );
1022
+ efree (ustring1 );
1023
+ ubrk_close (bi1 );
1024
+ RETURN_FALSE ;
1025
+ }
1026
+
1027
+ bi2 = grapheme_get_break_iterator (u_break_iterator_buffer2 , & ustatus );
1028
+ if (U_FAILURE (ustatus )) {
1029
+ intl_error_set_code (NULL , ustatus );
1030
+ intl_error_set_custom_msg (NULL , "Error on grapheme_get_break_iterator for argument #2 ($string2)" , 0 );
1031
+ efree (ustring2 );
1032
+ efree (ustring1 );
1033
+ ubrk_close (bi2 );
1034
+ ubrk_close (bi1 );
1035
+ RETURN_FALSE ;
1036
+ }
1037
+ ubrk_setText (bi1 , ustring1 , ustring1_len , & ustatus );
1038
+
1039
+ if (U_FAILURE (ustatus )) {
1040
+ intl_error_set_code (NULL , ustatus );
1041
+
1042
+ intl_error_set_custom_msg (NULL , "Error on ubrk_setText for argument #1 ($string1)" , 0 );
1043
+ efree (ustring2 );
1044
+ efree (ustring1 );
1045
+ ubrk_close (bi2 );
1046
+ ubrk_close (bi1 );
1047
+ RETURN_FALSE ;
1048
+ }
1049
+
1050
+ ubrk_setText (bi2 , ustring2 , ustring2_len , & ustatus );
1051
+ if (U_FAILURE (ustatus )) {
1052
+ intl_error_set_code (NULL , ustatus );
1053
+
1054
+ intl_error_set_custom_msg (NULL , "Error on ubrk_setText for argument #2 ($string2)" , 0 );
1055
+ efree (ustring2 );
1056
+ efree (ustring1 );
1057
+ ubrk_close (bi2 );
1058
+ ubrk_close (bi1 );
1059
+ RETURN_FALSE ;
1060
+ }
1061
+ UCollator * collator = ucol_open ("" , & ustatus );
1062
+ if (U_FAILURE (ustatus )) {
1063
+ intl_error_set_code (NULL , ustatus );
1064
+
1065
+ intl_error_set_custom_msg (NULL , "Error on ucol_open" , 0 );
1066
+ efree (ustring2 );
1067
+ efree (ustring1 );
1068
+ ubrk_close (bi2 );
1069
+ ubrk_close (bi1 );
1070
+ ucol_close (collator );
1071
+ RETURN_FALSE ;
1072
+ }
1073
+
1074
+ zend_long * p1 , * p2 , * tmp ;
1075
+ p1 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1076
+ p2 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1077
+
1078
+ for (i2 = 0 ; i2 <= strlen_2 ; i2 ++ ) {
1079
+ p1 [i2 ] = i2 * cost_ins ;
1080
+ }
1081
+
1082
+ int32_t current1 = 0 ;
1083
+ int32_t current2 = 0 ;
1084
+ int32_t pos1 = 0 ;
1085
+ int32_t pos2 = 0 ;
1086
+
1087
+ while (true) {
1088
+ current1 = ubrk_current (bi1 );
1089
+ pos1 = ubrk_next (bi1 );
1090
+ if (pos1 == UBRK_DONE ) {
1091
+ break ;
1092
+ }
1093
+ p2 [0 ] = p1 [0 ] + cost_del ;
1094
+ for (i2 = 0 , pos2 = 0 ; pos2 != UBRK_DONE ; i2 ++ ) {
1095
+ current2 = ubrk_current (bi2 );
1096
+ pos2 = ubrk_next (bi2 );
1097
+ if (pos2 == UBRK_DONE ) {
1098
+ break ;
1099
+ }
1100
+ if (ucol_strcoll (collator , ustring1 + current1 , pos1 - current1 , ustring2 + current2 , pos2 - current2 ) == UCOL_EQUAL ) {
1101
+ c0 = p1 [i2 ];
1102
+ } else {
1103
+ c0 = p1 [i2 ] + cost_rep ;
1104
+ }
1105
+ c1 = p1 [i2 + 1 ] + cost_del ;
1106
+ if (c1 < c0 ) {
1107
+ c0 = c1 ;
1108
+ }
1109
+ c2 = p2 [i2 ] + cost_ins ;
1110
+ if (c2 < c0 ) {
1111
+ c0 = c2 ;
1112
+ }
1113
+ p2 [i2 + 1 ] = c0 ;
1114
+ }
1115
+ ubrk_first (bi2 );
1116
+ tmp = p1 ;
1117
+ p1 = p2 ;
1118
+ p2 = tmp ;
1119
+ }
1120
+
1121
+ ucol_close (collator );
1122
+
1123
+ ubrk_close (bi1 );
1124
+ ubrk_close (bi2 );
1125
+
1126
+ efree (ustring1 );
1127
+ efree (ustring2 );
1128
+
1129
+ retval = p1 [strlen_2 ];
1130
+
1131
+ efree (p1 );
1132
+ efree (p2 );
1133
+ RETURN_LONG (retval );
1134
+ }
1135
+
921
1136
/* }}} */
0 commit comments