KompareDiff2

levenshteintable.h
1 /*
2 SPDX-FileCopyrightText: 2003 Otto Bruggeman <[email protected]>
3 SPDX-FileCopyrightText: 2011 Dmitry Risenberg <[email protected]>
4 
5 SPDX-License-Identifier: GPL-2.0-or-later
6 */
7 
8 #ifndef LEVENSHTEIN_H
9 #define LEVENSHTEIN_H
10 
11 #include <iostream>
12 // #include <QString>
13 // #include <komparediffdebug.h>
14 
15 namespace Diff2 {
16 
17 class Marker;
18 
19 
20 class Marker;
21 
22 /**
23  * Computes the Levenshtein distance between two sequences.
24  * The actual sequence contents must be prepended with one virtual item each for easier index access.
25  */
26 template<class SequencePair> class LevenshteinTable
27 {
28 public:
30  LevenshteinTable(unsigned int width, unsigned int height);
32 
33 public:
34  int getContent(unsigned int posX, unsigned int posY) const;
35  int setContent(unsigned int posX, unsigned int posY, int value);
36  bool setSize(unsigned int width, unsigned int height);
37 
38  unsigned int width() const { return m_width; };
39  unsigned int height() const { return m_height; };
40 
41  /** Debug method to check if the table is properly filled */
42  void dumpLevenshteinTable(void);
43 
44  /**
45  * This calculates the levenshtein distance of 2 sequences.
46  * This object takes ownership of the argument
47  */
48  unsigned int createTable(SequencePair* sequences);
49 
50  void createListsOfMarkers(void);
51  int chooseRoute(int c1, int c2, int c3, int current);
52 
53 protected:
54  LevenshteinTable(const LevenshteinTable& table);
55  const LevenshteinTable& operator = (const LevenshteinTable& table);
56 
57 private:
58  unsigned int m_width;
59  unsigned int m_height;
60  unsigned int m_size;
61  unsigned int* m_table;
62  SequencePair* m_sequences;
63 };
64 
65 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable()
66  : m_width(256),
67  m_height(256),
68  m_size(m_height* m_width),
69  m_table(new unsigned int[ m_size ]),
70  m_sequences(nullptr)
71 {
72 }
73 
74 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height)
75  : m_width(width),
76  m_height(height),
77  m_size(m_width* m_height),
78  m_table(new unsigned int[ m_size ]),
79  m_sequences(0)
80 {
81 }
82 
83 template<class SequencePair> LevenshteinTable<SequencePair>::~LevenshteinTable()
84 {
85  delete[] m_table;
86  delete m_sequences;
87 }
88 
89 template<class SequencePair> int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const
90 {
91 // qCDebug(LIBKOMPAREDIFF2) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY;
92  return m_table[ posY * m_width + posX ];
93 }
94 
95 template<class SequencePair> int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value)
96 {
97  m_table[ posY * m_width + posX ] = value;
98 
99  return 0;
100 }
101 
102 template<class SequencePair> bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height)
103 {
104  // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty
105  if (((width) * (height)) > (256 * 256 * 256))
106  return false;
107 
108  if (((width) * (height)) > m_size)
109  {
110  delete[] m_table;
111 
112  m_size = width * height;
113  m_table = new unsigned int[ m_size ];
114  }
115 
116  m_width = width;
117  m_height = height;
118 
119  return true;
120 }
121 
122 template<class SequencePair> void LevenshteinTable<SequencePair>::dumpLevenshteinTable()
123 {
124  for (unsigned int i = 0; i < m_height; ++i)
125  {
126  for (unsigned int j = 0; j < m_width; ++j)
127  {
128  std::cout.width(3);
129  std::cout << getContent(j, i);
130  }
131  std::cout << std::endl;
132  }
133 }
134 
135 template<class SequencePair> unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair* sequences)
136 {
137  m_sequences = sequences;
138  unsigned int m = m_sequences->lengthFirst();
139  unsigned int n = m_sequences->lengthSecond();
140 
141  if (!setSize(m, n))
142  return 0;
143 
144  unsigned int i;
145  unsigned int j;
146 
147  // initialize first row
148  for (i = 0; i < m; ++i)
149  setContent(i, 0, i);
150  // initialize first column
151  for (j = 0; j < n; ++j)
152  setContent(0, j, j);
153 
154  int cost = 0, north = 0, west = 0, northwest = 0;
155 
156  // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings
157  for (j = 1; j < n; ++j)
158  {
159  for (i = 1; i < m; ++i)
160  {
161  if (m_sequences->equal(i, j))
162  cost = 0;
163  else
164  cost = SequencePair::allowReplace ? 1 : 2;
165 
166  north = getContent(i, j - 1) + 1;
167  west = getContent(i - 1, j) + 1;
168  northwest = getContent(i - 1, j - 1) + cost;
169 
170  setContent(i, j, qMin(north, qMin(west, northwest)));
171  }
172  }
173 
174  return getContent(m - 1, n - 1);
175 }
176 
177 template<class SequencePair> int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current)
178 {
179 // qCDebug(LIBKOMPAREDIFF2) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3;
180  // preference order: c2, c3, c1, hopefully this will work out for me
181  if (c2 <= c1 && c2 <= c3)
182  {
183  if (SequencePair::allowReplace || (c2 == current))
184  {
185  return 1;
186  }
187  }
188 
189  if (c3 <= c2 && c3 <= c1)
190  return 2;
191 
192  return 0;
193 }
194 
195 template<class SequencePair> void LevenshteinTable<SequencePair>::createListsOfMarkers()
196 {
197 // qCDebug(LIBKOMPAREDIFF2) << source;
198 // qCDebug(LIBKOMPAREDIFF2) << destination;
199 // dumpLevenshteinTable();
200 
201  unsigned int x = m_width - 1;
202  unsigned int y = m_height - 1;
203 
204  unsigned int difference = getContent(x, y);
205 
206  // If the number of differences is more than half the length of the largest string
207  // don't bother to mark the individual changes
208  // Patch based on work by Felix Berger as put as attachment to bug 75794
209  if (!m_sequences->needFineGrainedOutput(difference))
210  {
211  m_sequences->prependFirst(new Marker(Marker::End, x));
212  m_sequences->prependFirst(new Marker(Marker::Start, 0));
213  m_sequences->prependSecond(new Marker(Marker::End, y));
214  m_sequences->prependSecond(new Marker(Marker::Start, 0));
215  return;
216  }
217 
218  Marker* c = nullptr;
219 
220  int n, nw, w, direction, currentValue;
221  while (x > 0 && y > 0)
222  {
223  currentValue = getContent(x, y);
224 
225  n = getContent(x, y - 1);
226  w = getContent(x - 1, y);
227  nw = getContent(x - 1, y - 1);
228  direction = chooseRoute(n, nw, w, currentValue);
229 
230  switch (direction)
231  {
232  case 0: // north
233 // qCDebug(LIBKOMPAREDIFF2) << "Picking north";
234 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
235 
236  if (!m_sequences->markerListSecond().isEmpty())
237  c = m_sequences->markerListSecond().first();
238  else
239  c = nullptr;
240 
241  if (c && c->type() == Marker::End)
242  {
243 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
244  if (n == currentValue)
245  m_sequences->prependSecond(new Marker(Marker::Start, y));
246 // else: the change continues, do not do anything
247  }
248  else
249  {
250 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
251  if (n < currentValue)
252  m_sequences->prependSecond(new Marker(Marker::End, y));
253  }
254 
255  --y;
256  if (y == 0) {
257  m_sequences->prependSecond(new Marker(Marker::Start, 0));
258  }
259  break;
260  case 1: // northwest
261 // qCDebug(LIBKOMPAREDIFF2) << "Picking northwest";
262 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
263 
264  if (!m_sequences->markerListSecond().isEmpty())
265  c = m_sequences->markerListSecond().first();
266  else
267  c = nullptr;
268 
269  if (c && c->type() == Marker::End)
270  {
271 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
272  if (nw == currentValue)
273  m_sequences->prependSecond(new Marker(Marker::Start, y));
274  // else: the change continues, do not do anything
275  }
276  else
277  {
278 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
279  if (nw < currentValue)
280  m_sequences->prependSecond(new Marker(Marker::End, y));
281  }
282 
283  if (!m_sequences->markerListFirst().isEmpty())
284  c = m_sequences->markerListFirst().first();
285  else
286  c = nullptr;
287 
288  if (c && c->type() == Marker::End)
289  {
290 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
291  if (nw == currentValue)
292  m_sequences->prependFirst(new Marker(Marker::Start, x));
293  // else: the change continues, do not do anything
294  }
295  else
296  {
297 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
298  if (nw < currentValue)
299  m_sequences->prependFirst(new Marker(Marker::End, x));
300  }
301 
302  --y;
303  --x;
304  break;
305  case 2: // west
306 // qCDebug(LIBKOMPAREDIFF2) << "Picking west";
307 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
308 
309  if (!m_sequences->markerListFirst().isEmpty())
310  c = m_sequences->markerListFirst().first();
311  else
312  c = nullptr;
313 
314  if (c && c->type() == Marker::End)
315  {
316 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
317  if (w == currentValue)
318  m_sequences->prependFirst(new Marker(Marker::Start, x));
319  // else: the change continues, do not do anything
320  }
321  else
322  {
323 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
324  if (w < currentValue)
325  m_sequences->prependFirst(new Marker(Marker::End, x));
326  }
327 
328  --x;
329  if (x == 0) {
330  m_sequences->prependFirst(new Marker(Marker::Start, 0));
331  }
332  break;
333  }
334  }
335 
336  // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now.
337  if (x != 0)
338  {
339  m_sequences->prependFirst(new Marker(Marker::End, x));
340  m_sequences->prependFirst(new Marker(Marker::Start, 0));
341  }
342 
343  if (y != 0)
344  {
345  m_sequences->prependSecond(new Marker(Marker::End, y));
346  m_sequences->prependSecond(new Marker(Marker::Start, 0));
347  }
348 
349 // qCDebug(LIBKOMPAREDIFF2) << "Source string: " << source;
350 
351 // QStringList list;
352 // int prevValue = 0;
353 // MarkerListConstIterator mit = m_sequences->markerListFirst().begin();
354 // MarkerListConstIterator end = m_sequences->markerListFirst().end();
355 // for ( ; mit != end; ++mit )
356 // {
357 // c = *mit;
358 // qCDebug(LIBKOMPAREDIFF2) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
359 // list.append( source.mid( prevValue, c->offset() - prevValue ) );
360 // prevValue = c->offset();
361 // }
362 // if ( prevValue < source.length() - 1 )
363 // {
364 // list.append( source.mid( prevValue, source.length() - prevValue ) );
365 // }
366 // qCDebug(LIBKOMPAREDIFF2) << "Source Resulting stringlist : " << list.join("\n");
367 //
368 // list.clear();
369 // prevValue = 0;
370 //
371 // qCDebug(LIBKOMPAREDIFF2) << "Destination string: " << destination;
372 // mit = m_sequences->markerListSecond().begin();
373 // end = m_sequences->markerListSecond().end();
374 // for ( ; mit != end; ++mit )
375 // {
376 // c = *mit;
377 // qCDebug(LIBKOMPAREDIFF2) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
378 // list.append( destination.mid( prevValue, c->offset() - prevValue ) );
379 // prevValue = c->offset();
380 // }
381 // if ( prevValue < destination.length() - 1 )
382 // {
383 // list.append( destination.mid( prevValue, destination.length() - prevValue ) );
384 // }
385 // qCDebug(LIBKOMPAREDIFF2) << "Destination Resulting string : " << list.join("\n");
386 }
387 
388 } // namespace Diff2
389 
390 #endif // LEVENSHTEIN_H
Diff2 namespace.
Definition: cvsdiffparser.h:12
void dumpLevenshteinTable(void)
Debug method to check if the table is properly filled.
unsigned int createTable(SequencePair *sequences)
This calculates the levenshtein distance of 2 sequences.
Computes the Levenshtein distance between two sequences.
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Sun Nov 27 2022 03:52:45 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.