KMime

kmime_charfreq.h
Go to the documentation of this file.
1 /* -*- c++ -*-
2  kmime_charfreq.h
3 
4  KMime, the KDE Internet mail/usenet news message library.
5  SPDX-FileCopyrightText: 2001-2002 Marc Mutz <[email protected]>
6 
7  SPDX-License-Identifier: LGPL-2.0-or-later
8 */
9 /**
10  @file
11  This file is part of the API for handling @ref MIME data and
12  defines the CharFreq class.
13 
14  @brief
15  Defines the CharFreq class.
16 
17  @authors Marc Mutz <[email protected]>
18 
19  @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit:
20  Data that contains at least one byte with a value greater than 127, or at
21  least one NUL byte.
22 
23  @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary:
24  Eight-bit data that contains a high percentage of non-ascii values,
25  or lines longer than 998 characters, or stray CRs, or NULs.
26 
27  @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text:
28  Eight-bit data that contains a high percentage of ascii values,
29  no lines longer than 998 characters, no NULs, and either only LFs or
30  only CRLFs.
31 
32  @glossary @anchor Seven-Bit @anchor seven-bit @b 7-bit:
33  Data that contains bytes with all values less than 128, and no NULs.
34 
35  @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary:
36  Seven-bit data that contains a high percentage of non-printable values,
37  or lines longer than 998 characters, or stray CRs.
38 
39  @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text:
40  Seven-bit data that contains a high percentage of ascii values,
41  no lines longer than 998 characters, and either only LFs, or only CRLFs.
42 */
43 
44 #pragma once
45 
46 #include <QByteArray>
47 #undef None
48 
49 namespace KMime
50 {
51 
52 /**
53  @brief
54  A class for performing basic data typing using frequency count heuristics.
55 
56  This class performs character frequency counts on the provided data which
57  are used in heuristics to determine a basic data type. The data types are:
58 
59  - @ref Eight-Bit-Binary
60  - @ref Eight-Bit-Text
61  - @ref Seven-Bit-Binary
62  - @ref Seven-Bit-Text
63 */
64 class CharFreq
65 {
66 public:
67  /**
68  Constructs a Character Frequency instance for a buffer @p buf of
69  QByteArray data.
70 
71  @param buf is a QByteArray containing the data.
72  */
73  explicit CharFreq(const QByteArray &buf);
74 
75  /**
76  Constructs a Character Frequency instance for a buffer @p buf of
77  chars of length @p len.
78 
79  @param buf is a pointer to a character string containing the data.
80  @param len is the length of @p buf, in characters.
81  */
82  CharFreq(const char *buf, size_t len);
83 
84  /**
85  The different types of data.
86  */
87  enum Type {
88  None = 0, /**< Unknown */
89  EightBitData, /**< 8bit binary */
90  Binary = EightBitData, /**< Alias for EightBitData (8bit binary) */
91  SevenBitData, /**< 7bit binary */
92  EightBitText, /**< 8bit text */
93  SevenBitText /**< 7bit text */
94  };
95 
96  /**
97  Returns the data #Type as derived from the class heuristics.
98  */
99  Q_REQUIRED_RESULT Type type() const;
100 
101  /**
102  Returns true if the data #Type is EightBitData; false otherwise.
103  */
104  Q_REQUIRED_RESULT bool isEightBitData() const;
105 
106  /**
107  Returns true if the data #Type is EightBitText; false otherwise.
108  */
109  Q_REQUIRED_RESULT bool isEightBitText() const;
110 
111  /**
112  Returns true if the data #Type is SevenBitData; false otherwise.
113  */
114  Q_REQUIRED_RESULT bool isSevenBitData() const;
115 
116  /**
117  Returns true if the data #Type is SevenBitText; false otherwise.
118  */
119  Q_REQUIRED_RESULT bool isSevenBitText() const;
120 
121  /**
122  Returns true if the data contains trailing whitespace, i.e.,
123  if any line ends with a space (' ') or a tab ('\\t').
124  */
125  Q_REQUIRED_RESULT bool hasTrailingWhitespace() const;
126 
127  /**
128  Returns true if the data contains a line that starts with "From ".
129  */
130  Q_REQUIRED_RESULT bool hasLeadingFrom() const;
131 
132  /**
133  Returns the percentage of printable characters in the data.
134  The result is undefined if the number of data characters is zero.
135  */
136  Q_REQUIRED_RESULT float printableRatio() const;
137 
138  /**
139  Returns the percentage of control code characters (CTLs) in the data.
140  The result is undefined if the number of data characters is zero.
141  */
142  Q_REQUIRED_RESULT float controlCodesRatio() const;
143 
144 private:
145  //@cond PRIVATE
146  uint mNUL; // count of NUL chars
147  uint mCTL; // count of CTLs (incl. DEL, excl. CR, LF, HT)
148  uint mCR; // count of CR chars
149  uint mLF; // count of LF chars
150  uint mCRLF; // count of LFs preceded by CRs (CRLF sequences)
151  uint mPrintable; // count of printable US-ASCII chars (SPC..~)
152  uint mEightBit; // count of other latin1 chars (those with 8th bit set)
153  uint mTotal; // count of all chars
154  uint mLineMin; // minimum line length
155  uint mLineMax; // maximum line length; NOTE(review): the uint counters have no in-class initializers, unlike the flags below - presumably the constructors set them; confirm
156  bool mTrailingWS = false; // does the buffer contain trailing whitespace?
157  bool mLeadingFrom = false; // does the buffer contain lines starting with "From "?
158  //@endcond
159 
160  /**
161  Performs the character frequency counts on the data.
162 
163  @param buf is a pointer to a character string containing the data.
164  @param len is the length of @p buf, in characters.
165  */
166  void count(const char *buf, size_t len);
167 };
168 
169 } // namespace KMime
170 
bool isEightBitText() const
Returns true if the data Type is EightBitText; false otherwise.
bool isEightBitData() const
Returns true if the data Type is EightBitData; false otherwise.
bool isSevenBitText() const
Returns true if the data Type is SevenBitText; false otherwise.
bool hasTrailingWhitespace() const
Returns true if the data contains trailing whitespace.
Type
The different types of data.
bool isSevenBitData() const
Returns true if the data Type is SevenBitData; false otherwise.
float printableRatio() const
Returns the percentage of printable characters in the data.
CharFreq(const QByteArray &buf)
Constructs a Character Frequency instance for a buffer buf of QByteArray data.
bool hasLeadingFrom() const
Returns true if the data contains a line that starts with "From ".
float controlCodesRatio() const
Returns the percentage of control code characters (CTLs) in the data.
A class for performing basic data typing using frequency count heuristics.
Type type() const
Returns the data Type as derived from the class heuristics.
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Fri Jun 18 2021 23:12:50 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.