KompareDiff2

levenshteintable.h
1/*
2SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl>
3SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com>
4
5SPDX-License-Identifier: GPL-2.0-or-later
6*/
7
8#ifndef KOMPAREDIFF2_LEVENSHTEINTABLE_H
9#define KOMPAREDIFF2_LEVENSHTEINTABLE_H
10
11#include <iostream>
12// #include <QString>
13// #include <komparediffdebug.h>
14
15namespace Diff2 {
16
17class Marker;
18
19
20class Marker;
21
22/**
23 * Computes the Levenshtein distance between two sequences.
24 * The actual sequence contents must be prepended with one virtual item each for easier index access.
25 */
26template<class SequencePair> class LevenshteinTable
27{
28public:
30 LevenshteinTable(unsigned int width, unsigned int height);
32
33public:
34 int getContent(unsigned int posX, unsigned int posY) const;
35 int setContent(unsigned int posX, unsigned int posY, int value);
36 bool setSize(unsigned int width, unsigned int height);
37
38 unsigned int width() const { return m_width; };
39 unsigned int height() const { return m_height; };
40
41 /** Debug method to check if the table is properly filled */
42 void dumpLevenshteinTable(void);
43
44 /**
45 * This calculates the levenshtein distance of 2 sequences.
46 * This object takes ownership of the argument
47 */
48 unsigned int createTable(SequencePair* sequences);
49
50 void createListsOfMarkers(void);
51 int chooseRoute(int c1, int c2, int c3, int current);
52
53protected:
55 const LevenshteinTable& operator = (const LevenshteinTable& table);
56
57private:
58 unsigned int m_width;
59 unsigned int m_height;
60 unsigned int m_size;
61 unsigned int* m_table;
62 SequencePair* m_sequences;
63};
64
65template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable()
66 : m_width(256),
67 m_height(256),
68 m_size(m_height* m_width),
69 m_table(new unsigned int[ m_size ]),
70 m_sequences(nullptr)
71{
72}
73
74template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height)
75 : m_width(width),
76 m_height(height),
77 m_size(m_width* m_height),
78 m_table(new unsigned int[ m_size ]),
79 m_sequences(0)
80{
81}
82
83template<class SequencePair> LevenshteinTable<SequencePair>::~LevenshteinTable()
84{
85 delete[] m_table;
86 delete m_sequences;
87}
88
89template<class SequencePair> int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const
90{
91// qCDebug(LIBKOMPAREDIFF2) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY;
92 return m_table[ posY * m_width + posX ];
93}
94
95template<class SequencePair> int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value)
96{
97 m_table[ posY * m_width + posX ] = value;
98
99 return 0;
100}
101
102template<class SequencePair> bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height)
103{
104 // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty
105 if (((width) * (height)) > (256 * 256 * 256))
106 return false;
107
108 if (((width) * (height)) > m_size)
109 {
110 delete[] m_table;
111
112 m_size = width * height;
113 m_table = new unsigned int[ m_size ];
114 }
115
116 m_width = width;
117 m_height = height;
118
119 return true;
120}
121
123{
124 for (unsigned int i = 0; i < m_height; ++i)
125 {
126 for (unsigned int j = 0; j < m_width; ++j)
127 {
128 std::cout.width(3);
129 std::cout << getContent(j, i);
130 }
131 std::cout << std::endl;
132 }
133}
134
135template<class SequencePair> unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair* sequences)
136{
137 m_sequences = sequences;
138 unsigned int m = m_sequences->lengthFirst();
139 unsigned int n = m_sequences->lengthSecond();
140
141 if (!setSize(m, n))
142 return 0;
143
144 unsigned int i;
145 unsigned int j;
146
147 // initialize first row
148 for (i = 0; i < m; ++i)
149 setContent(i, 0, i);
150 // initialize first column
151 for (j = 0; j < n; ++j)
152 setContent(0, j, j);
153
154 int cost = 0, north = 0, west = 0, northwest = 0;
155
156 // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings
157 for (j = 1; j < n; ++j)
158 {
159 for (i = 1; i < m; ++i)
160 {
161 if (m_sequences->equal(i, j))
162 cost = 0;
163 else
164 cost = SequencePair::allowReplace ? 1 : 2;
165
166 north = getContent(i, j - 1) + 1;
167 west = getContent(i - 1, j) + 1;
168 northwest = getContent(i - 1, j - 1) + cost;
169
170 setContent(i, j, qMin(north, qMin(west, northwest)));
171 }
172 }
173
174 return getContent(m - 1, n - 1);
175}
176
177template<class SequencePair> int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current)
178{
179// qCDebug(LIBKOMPAREDIFF2) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3;
180 // preference order: c2, c3, c1, hopefully this will work out for me
181 if (c2 <= c1 && c2 <= c3)
182 {
183 if (SequencePair::allowReplace || (c2 == current))
184 {
185 return 1;
186 }
187 }
188
189 if (c3 <= c2 && c3 <= c1)
190 return 2;
191
192 return 0;
193}
194
195template<class SequencePair> void LevenshteinTable<SequencePair>::createListsOfMarkers()
196{
197// qCDebug(LIBKOMPAREDIFF2) << source;
198// qCDebug(LIBKOMPAREDIFF2) << destination;
199// dumpLevenshteinTable();
200
201 unsigned int x = m_width - 1;
202 unsigned int y = m_height - 1;
203
204 unsigned int difference = getContent(x, y);
205
206 // If the number of differences is more than half the length of the largest string
207 // don't bother to mark the individual changes
208 // Patch based on work by Felix Berger as put as attachment to bug 75794
209 if (!m_sequences->needFineGrainedOutput(difference))
210 {
211 m_sequences->prependFirst(new Marker(Marker::End, x));
212 m_sequences->prependFirst(new Marker(Marker::Start, 0));
213 m_sequences->prependSecond(new Marker(Marker::End, y));
214 m_sequences->prependSecond(new Marker(Marker::Start, 0));
215 return;
216 }
217
218 Marker* c = nullptr;
219
220 int n, nw, w, direction, currentValue;
221 while (x > 0 && y > 0)
222 {
223 currentValue = getContent(x, y);
224
225 n = getContent(x, y - 1);
226 w = getContent(x - 1, y);
227 nw = getContent(x - 1, y - 1);
228 direction = chooseRoute(n, nw, w, currentValue);
229
230 switch (direction)
231 {
232 case 0: // north
233// qCDebug(LIBKOMPAREDIFF2) << "Picking north";
234// qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
235
236 if (!m_sequences->markerListSecond().isEmpty())
237 c = m_sequences->markerListSecond().first();
238 else
239 c = nullptr;
240
241 if (c && c->type() == Marker::End)
242 {
243// qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
244 if (n == currentValue)
245 m_sequences->prependSecond(new Marker(Marker::Start, y));
246// else: the change continues, do not do anything
247 }
248 else
249 {
250// qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
251 if (n < currentValue)
252 m_sequences->prependSecond(new Marker(Marker::End, y));
253 }
254
255 --y;
256 if (y == 0) {
257 m_sequences->prependSecond(new Marker(Marker::Start, 0));
258 }
259 break;
260 case 1: // northwest
261// qCDebug(LIBKOMPAREDIFF2) << "Picking northwest";
262// qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
263
264 if (!m_sequences->markerListSecond().isEmpty())
265 c = m_sequences->markerListSecond().first();
266 else
267 c = nullptr;
268
269 if (c && c->type() == Marker::End)
270 {
271// qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
272 if (nw == currentValue)
273 m_sequences->prependSecond(new Marker(Marker::Start, y));
274 // else: the change continues, do not do anything
275 }
276 else
277 {
278// qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
279 if (nw < currentValue)
280 m_sequences->prependSecond(new Marker(Marker::End, y));
281 }
282
283 if (!m_sequences->markerListFirst().isEmpty())
284 c = m_sequences->markerListFirst().first();
285 else
286 c = nullptr;
287
288 if (c && c->type() == Marker::End)
289 {
290// qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
291 if (nw == currentValue)
292 m_sequences->prependFirst(new Marker(Marker::Start, x));
293 // else: the change continues, do not do anything
294 }
295 else
296 {
297// qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
298 if (nw < currentValue)
299 m_sequences->prependFirst(new Marker(Marker::End, x));
300 }
301
302 --y;
303 --x;
304 break;
305 case 2: // west
306// qCDebug(LIBKOMPAREDIFF2) << "Picking west";
307// qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
308
309 if (!m_sequences->markerListFirst().isEmpty())
310 c = m_sequences->markerListFirst().first();
311 else
312 c = nullptr;
313
314 if (c && c->type() == Marker::End)
315 {
316// qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
317 if (w == currentValue)
318 m_sequences->prependFirst(new Marker(Marker::Start, x));
319 // else: the change continues, do not do anything
320 }
321 else
322 {
323// qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
324 if (w < currentValue)
325 m_sequences->prependFirst(new Marker(Marker::End, x));
326 }
327
328 --x;
329 if (x == 0) {
330 m_sequences->prependFirst(new Marker(Marker::Start, 0));
331 }
332 break;
333 }
334 }
335
336 // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now.
337 if (x != 0)
338 {
339 m_sequences->prependFirst(new Marker(Marker::End, x));
340 m_sequences->prependFirst(new Marker(Marker::Start, 0));
341 }
342
343 if (y != 0)
344 {
345 m_sequences->prependSecond(new Marker(Marker::End, y));
346 m_sequences->prependSecond(new Marker(Marker::Start, 0));
347 }
348
349// qCDebug(LIBKOMPAREDIFF2) << "Source string: " << source;
350
351// QStringList list;
352// int prevValue = 0;
353// MarkerListConstIterator mit = m_sequences->markerListFirst().begin();
354// MarkerListConstIterator end = m_sequences->markerListFirst().end();
355// for ( ; mit != end; ++mit )
356// {
357// c = *mit;
358// qCDebug(LIBKOMPAREDIFF2) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
359// list.append( source.mid( prevValue, c->offset() - prevValue ) );
360// prevValue = c->offset();
361// }
362// if ( prevValue < source.length() - 1 )
363// {
364// list.append( source.mid( prevValue, source.length() - prevValue ) );
365// }
366// qCDebug(LIBKOMPAREDIFF2) << "Source Resulting stringlist : " << list.join("\n");
367//
368// list.clear();
369// prevValue = 0;
370//
371// qCDebug(LIBKOMPAREDIFF2) << "Destination string: " << destination;
372// mit = m_sequences->markerListSecond().begin();
373// end = m_sequences->markerListSecond().end();
374// for ( ; mit != end; ++mit )
375// {
376// c = *mit;
377// qCDebug(LIBKOMPAREDIFF2) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
378// list.append( destination.mid( prevValue, c->offset() - prevValue ) );
379// prevValue = c->offset();
380// }
381// if ( prevValue < destination.length() - 1 )
382// {
383// list.append( destination.mid( prevValue, destination.length() - prevValue ) );
384// }
385// qCDebug(LIBKOMPAREDIFF2) << "Destination Resulting string : " << list.join("\n");
386}
387
388} // namespace Diff2
389
390#endif // KOMPAREDIFF2_LEVENSHTEINTABLE_H
Computes the Levenshtein distance between two sequences.
unsigned int createTable(SequencePair *sequences)
This calculates the levenshtein distance of 2 sequences.
void dumpLevenshteinTable(void)
Debug method to check if the table is properly filled.
Diff2 namespace.
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Sat Apr 27 2024 22:10:24 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.