@@ -36,32 +36,55 @@ cpdef cnp.ndarray calculations(object reference, object hypothesis):
3636 cdef list hypothesis_word = hypothesis.split()
3737
3838 # Use Py_ssize_t for indices and sizes
39+ # Py_ssize_t matches Python's internal index type and avoids unnecessary
40+ # casts or overflow risks when working with Python lists and memoryviews.
3941 cdef Py_ssize_t m = len (reference_word)
4042 cdef Py_ssize_t n = len (hypothesis_word)
4143 cdef Py_ssize_t i, j
42- cdef int substitution_cost, ld, insertions, deletions, substitutions
44+
45+ # Metrics and outputs
46+ cdef int ld, insertions, deletions, substitutions
47+ cdef double wer
4348 cdef list inserted_words, deleted_words, substituted_words
4449
50+ # Variables for optimized DP loop
51+ cdef int cost, del_cost, ins_cost, sub_cost, best
52+
4553 # Initialize the Levenshtein distance matrix
46- cdef int [:, :] ldm = np.zeros((m + 1 , n + 1 ), dtype = np.int32)
54+ # Allocate the (m+1) x (n+1) DP matrix with np.empty rather than np.zeros:
55+ # zero-filling would be redundant because every cell is written before it is
56+ # read. SAFETY: row 0 and column 0 are set by the boundary loops below, and
57+ # the DP loop then fills every remaining cell (i >= 1, j >= 1).
58+ cdef int [:, :] ldm = np.empty((m + 1 , n + 1 ), dtype = np.int32)
4759
48- # Fill the Levenshtein distance matrix
60+ # Initialize first column and first row (boundary conditions)
4961 for i in range (m + 1 ):
50- for j in range (n + 1 ):
51- if i == 0 :
52- ldm[i, j] = j
53- elif j == 0 :
54- ldm[i, j] = i
55- else :
56- substitution_cost = 0 if reference_word[i - 1 ] == hypothesis_word[j - 1 ] else 1
57- ldm[i, j] = min (
58- ldm[i - 1 , j] + 1 , # Deletion
59- ldm[i, j - 1 ] + 1 , # Insertion
60- ldm[i - 1 , j - 1 ] + substitution_cost # Substitution
61- )
62+ ldm[i, 0 ] = < int > i
63+ for j in range (n + 1 ):
64+ ldm[0 , j] = < int > j
65+
66+ # Fill the Levenshtein distance matrix
67+ # Boundary cases are handled above, so the inner loop carries no i == 0 or
68+ # j == 0 branches. The three-way minimum is selected manually on C ints; only
69+ # the word comparison still operates on Python objects (the list elements).
70+ for i in range (1 , m + 1 ):
71+ for j in range (1 , n + 1 ):
72+ cost = 0 if reference_word[i - 1 ] == hypothesis_word[j - 1 ] else 1
73+
74+ del_cost = ldm[i - 1 , j] + 1
75+ ins_cost = ldm[i, j - 1 ] + 1
76+ sub_cost = ldm[i - 1 , j - 1 ] + cost
77+
78+ best = del_cost
79+ if ins_cost < best:
80+ best = ins_cost
81+ if sub_cost < best:
82+ best = sub_cost
83+
84+ ldm[i, j] = best
6285
6386 ld = ldm[m, n]
64- wer = ld / m
87+ wer = ( < double > ld) / m
6588
6689 insertions, deletions, substitutions = 0 , 0 , 0
6790 inserted_words, deleted_words, substituted_words = [], [], []
0 commit comments