KompareDiff2

levenshteintable.h
1/*
2 SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl>
3 SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com>
4
5 SPDX-License-Identifier: GPL-2.0-or-later
6*/
7
8#ifndef KOMPAREDIFF2_LEVENSHTEINTABLE_H
9#define KOMPAREDIFF2_LEVENSHTEINTABLE_H
10
11// #include <QString>
12// #include <komparediff2_logging.h>
13// Std
14#include <iostream>
15#include <memory>
16#include <vector>
17
18namespace KompareDiff2
19{
20
21class Marker;
22
23class Marker;
24
25/**
26 * Computes the Levenshtein distance between two sequences.
27 * The actual sequence contents must be prepended with one virtual item each for easier index access.
28 */
29template<class SequencePair>
31{
32public:
34 LevenshteinTable(unsigned int width, unsigned int height);
35 ~LevenshteinTable() = default;
36
37 LevenshteinTable(const LevenshteinTable &table) = delete;
38 const LevenshteinTable &operator=(const LevenshteinTable &table) = delete;
39
40public:
41 int getContent(unsigned int posX, unsigned int posY) const;
42 int setContent(unsigned int posX, unsigned int posY, int value);
43 bool setSize(unsigned int width, unsigned int height);
44
45 unsigned int width() const
46 {
47 return m_width;
48 };
49 unsigned int height() const
50 {
51 return m_height;
52 };
53
54 /** Debug method to check if the table is properly filled */
56
57 /**
58 * This calculates the levenshtein distance of 2 sequences.
59 * This object takes ownership of the argument
60 */
61 unsigned int createTable(SequencePair *sequences);
62
63 void createListsOfMarkers();
64 int chooseRoute(int c1, int c2, int c3, int current);
65
66private:
67 unsigned int m_width = 256;
68 unsigned int m_height = 256;
69 unsigned int m_size;
70 std::vector<unsigned int> m_table;
71 std::unique_ptr<SequencePair> m_sequences;
72};
73
74template<class SequencePair>
76 : m_size(m_height * m_width)
77 , m_table(m_size)
78{
79}
80
81template<class SequencePair>
82LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height)
83 : m_width(width)
84 , m_height(height)
85 , m_size(m_width * m_height)
86 , m_table(m_size)
87{
88}
89
90template<class SequencePair>
91int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const
92{
93// qCDebug(KOMPAREDIFF2_LOG) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY;
94 return m_table[posY * m_width + posX];
95}
96
97template<class SequencePair>
98int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value)
99{
100 m_table[posY * m_width + posX] = value;
101
102 return 0;
103}
104
105template<class SequencePair>
106bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height)
107{
108 // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty
109 if (((width) * (height)) > (256 * 256 * 256))
110 return false;
111
112 if (((width) * (height)) > m_size) {
113 m_size = width * height;
114 m_table.resize(m_size);
115 }
116
117 m_width = width;
118 m_height = height;
119
120 return true;
121}
122
123template<class SequencePair>
125{
126 for (unsigned int i = 0; i < m_height; ++i) {
127 for (unsigned int j = 0; j < m_width; ++j) {
128 std::cout.width(3);
129 std::cout << getContent(j, i);
130 }
131 std::cout << std::endl;
132 }
133}
134
135template<class SequencePair>
136unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair *sequences)
137{
138 m_sequences.reset(sequences);
139 unsigned int m = m_sequences->lengthFirst();
140 unsigned int n = m_sequences->lengthSecond();
141
142 if (!setSize(m, n))
143 return 0;
144
145 unsigned int i;
146 unsigned int j;
147
148 // initialize first row
149 for (i = 0; i < m; ++i)
150 setContent(i, 0, i);
151 // initialize first column
152 for (j = 0; j < n; ++j)
153 setContent(0, j, j);
154
155 int cost = 0, north = 0, west = 0, northwest = 0;
156
157 // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings
158 for (j = 1; j < n; ++j) {
159 for (i = 1; i < m; ++i) {
160 if (m_sequences->equal(i, j))
161 cost = 0;
162 else
163 cost = SequencePair::allowReplace ? 1 : 2;
164
165 north = getContent(i, j - 1) + 1;
166 west = getContent(i - 1, j) + 1;
167 northwest = getContent(i - 1, j - 1) + cost;
168
169 setContent(i, j, qMin(north, qMin(west, northwest)));
170 }
171 }
172
173 return getContent(m - 1, n - 1);
174}
175
176template<class SequencePair>
177int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current)
178{
179// qCDebug(KOMPAREDIFF2_LOG) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3;
180 // preference order: c2, c3, c1, hopefully this will work out for me
181 if (c2 <= c1 && c2 <= c3) {
182 if (SequencePair::allowReplace || (c2 == current)) {
183 return 1;
184 }
185 }
186
187 if (c3 <= c2 && c3 <= c1)
188 return 2;
189
190 return 0;
191}
192
193template<class SequencePair>
194void LevenshteinTable<SequencePair>::createListsOfMarkers()
195{
196// qCDebug(KOMPAREDIFF2_LOG) << source;
197// qCDebug(KOMPAREDIFF2_LOG) << destination;
198// dumpLevenshteinTable();
199
200 unsigned int x = m_width - 1;
201 unsigned int y = m_height - 1;
202
203 unsigned int difference = getContent(x, y);
204
205 // If the number of differences is more than half the length of the largest string
206 // don't bother to mark the individual changes
207 // Patch based on work by Felix Berger as put as attachment to bug 75794
208 if (!m_sequences->needFineGrainedOutput(difference)) {
209 m_sequences->prependFirst(new Marker(Marker::End, x));
210 m_sequences->prependFirst(new Marker(Marker::Start, 0));
211 m_sequences->prependSecond(new Marker(Marker::End, y));
212 m_sequences->prependSecond(new Marker(Marker::Start, 0));
213 return;
214 }
215
216 Marker *c = nullptr;
217
218 int n, nw, w, direction, currentValue;
219 while (x > 0 && y > 0) {
220 currentValue = getContent(x, y);
221
222 n = getContent(x, y - 1);
223 w = getContent(x - 1, y);
224 nw = getContent(x - 1, y - 1);
225 direction = chooseRoute(n, nw, w, currentValue);
226
227 switch (direction) {
228 case 0: // north
229// qCDebug(KOMPAREDIFF2_LOG) << "Picking north";
230// qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
231
232 if (!m_sequences->markerListSecond().isEmpty())
233 c = m_sequences->markerListSecond().first();
234 else
235 c = nullptr;
236
237 if (c && c->type() == Marker::End) {
238// qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
239 if (n == currentValue)
240 m_sequences->prependSecond(new Marker(Marker::Start, y));
241 // else: the change continues, do not do anything
242 } else {
243// qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
244 if (n < currentValue)
245 m_sequences->prependSecond(new Marker(Marker::End, y));
246 }
247
248 --y;
249 if (y == 0) {
250 m_sequences->prependSecond(new Marker(Marker::Start, 0));
251 }
252 break;
253 case 1: // northwest
254// qCDebug(KOMPAREDIFF2_LOG) << "Picking northwest";
255// qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
256
257 if (!m_sequences->markerListSecond().isEmpty())
258 c = m_sequences->markerListSecond().first();
259 else
260 c = nullptr;
261
262 if (c && c->type() == Marker::End) {
263// qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue;
264 if (nw == currentValue)
265 m_sequences->prependSecond(new Marker(Marker::Start, y));
266 // else: the change continues, do not do anything
267 } else {
268// qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
269 if (nw < currentValue)
270 m_sequences->prependSecond(new Marker(Marker::End, y));
271 }
272
273 if (!m_sequences->markerListFirst().isEmpty())
274 c = m_sequences->markerListFirst().first();
275 else
276 c = nullptr;
277
278 if (c && c->type() == Marker::End) {
279// qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue;
280 if (nw == currentValue)
281 m_sequences->prependFirst(new Marker(Marker::Start, x));
282 // else: the change continues, do not do anything
283 } else {
284// qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
285 if (nw < currentValue)
286 m_sequences->prependFirst(new Marker(Marker::End, x));
287 }
288
289 --y;
290 --x;
291 break;
292 case 2: // west
293// qCDebug(KOMPAREDIFF2_LOG) << "Picking west";
294// qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
295
296 if (!m_sequences->markerListFirst().isEmpty())
297 c = m_sequences->markerListFirst().first();
298 else
299 c = nullptr;
300
301 if (c && c->type() == Marker::End) {
302// qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue;
303 if (w == currentValue)
304 m_sequences->prependFirst(new Marker(Marker::Start, x));
305 // else: the change continues, do not do anything
306 } else {
307// qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
308 if (w < currentValue)
309 m_sequences->prependFirst(new Marker(Marker::End, x));
310 }
311
312 --x;
313 if (x == 0) {
314 m_sequences->prependFirst(new Marker(Marker::Start, 0));
315 }
316 break;
317 }
318 }
319
320 // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now.
321 if (x != 0) {
322 m_sequences->prependFirst(new Marker(Marker::End, x));
323 m_sequences->prependFirst(new Marker(Marker::Start, 0));
324 }
325
326 if (y != 0) {
327 m_sequences->prependSecond(new Marker(Marker::End, y));
328 m_sequences->prependSecond(new Marker(Marker::Start, 0));
329 }
330
331// qCDebug(KOMPAREDIFF2_LOG) << "Source string: " << source;
332
333// QStringList list;
334// int prevValue = 0;
335// MarkerListConstIterator mit = m_sequences->markerListFirst().begin();
336// MarkerListConstIterator end = m_sequences->markerListFirst().end();
337// for ( ; mit != end; ++mit )
338// {
339// c = *mit;
340// qCDebug(KOMPAREDIFF2_LOG) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
341// list.append( source.mid( prevValue, c->offset() - prevValue ) );
342// prevValue = c->offset();
343// }
344// if ( prevValue < source.length() - 1 )
345// {
346// list.append( source.mid( prevValue, source.length() - prevValue ) );
347// }
348// qCDebug(KOMPAREDIFF2_LOG) << "Source Resulting stringlist : " << list.join("\n");
349//
350// list.clear();
351// prevValue = 0;
352//
353// qCDebug(KOMPAREDIFF2_LOG) << "Destination string: " << destination;
354// mit = m_sequences->markerListSecond().begin();
355// end = m_sequences->markerListSecond().end();
356// for ( ; mit != end; ++mit )
357// {
358// c = *mit;
359// qCDebug(KOMPAREDIFF2_LOG) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
360// list.append( destination.mid( prevValue, c->offset() - prevValue ) );
361// prevValue = c->offset();
362// }
363// if ( prevValue < destination.length() - 1 )
364// {
365// list.append( destination.mid( prevValue, destination.length() - prevValue ) );
366// }
367// qCDebug(KOMPAREDIFF2_LOG) << "Destination Resulting string : " << list.join("\n");
368}
369
370} // namespace KompareDiff2
371
372#endif // KOMPAREDIFF2_LEVENSHTEINTABLE_H
Computes the Levenshtein distance between two sequences.
void dumpLevenshteinTable()
Debug method to check if the table is properly filled.
unsigned int createTable(SequencePair *sequences)
This calculates the levenshtein distance of 2 sequences.
KompareDiff2 namespace.
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri Nov 29 2024 11:50:47 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.