Source code: com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java
1 ////////////////////////////////////////////////////////////////////////////////
2 // checkstyle: Checks Java source code for adherence to a set of rules.
3 // Copyright (C) 2001-2003 Oliver Burn
4 //
5 // This library is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU Lesser General Public
7 // License as published by the Free Software Foundation; either
8 // version 2.1 of the License, or (at your option) any later version.
9 //
10 // This library is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public
16 // License along with this library; if not, write to the Free Software
17 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 ////////////////////////////////////////////////////////////////////////////////
19 package com.puppycrawl.tools.checkstyle.checks.duplicates;
20
21 import java.io.File;
22 import java.io.IOException;
23 import java.util.Arrays;
24
25 import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck;
26 import com.puppycrawl.tools.checkstyle.api.Utils;
27 import com.puppycrawl.tools.checkstyle.api.MessageDispatcher;
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30
31 /**
32 * Checks for duplicate code.
33 *
34 * <p>
35 * There are many approaches for detecting duplicate code. Some involve
36 * parsing a file of a programming language and analyzing the source trees
37 * of all files. This is a very powerful approach for a specific programming
38 * language (such as Java), as it can potentially even detect duplicate code
39 * where linebreaks have been changed, variables have been renamed, etc.
40 * </p>
41 * <p>
42 * This copy and paste detection implementation works differently.
43 * It cannot detect copy and paste code where the author deliberately
44 * tries to hide his copy+paste action. Instead it focusses on the standard
45 * corporate problem of reuse by copy and paste. Usually this leaves linebreaks
46 * and variable names intact. Since we do not need to analyse a parse tree
47 * our tool is not tied to a particular programming language.
48 * </p>
49 * <p>
50 * <a href="http://www.redhillconsulting.com.au/products/simian/">Simian</a>
51 * is a very good commercial duplicate code detection tool. It comes with
52 * a Checkstyle module, so we encourage all users to evaluate Simian
53 * as an alternative to this check.
54 * </p>
55 *
56 * @author Lars Kühne
57 */
58 public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
59 {
60 /**
61 * Converts each of the original source lines
62 * to a checksum that is checked against to find duplicates.
63 */
64 private interface ChecksumGenerator
65 {
66 /**
67 * Convert each of the original source lines
68 * to a checksum that is checked against to find duplicates
69 * Typically this involves stripping whitespace.
70 * @param aOriginalLines the original lines as they appear in the source
71 * @return the converted line or null if the line should be ignored
72 */
73 long[] convertLines(String[] aOriginalLines);
74 }
75
76
77 /**
78 * Calculates checksums for text files.
79 * Removes leading and trainling whitespace.
80 */
81 private class TextfileChecksumGenerator implements ChecksumGenerator
82 {
83 /** @see ChecksumGenerator#convertLines */
84 public long[] convertLines(String[] aOriginalLines)
85 {
86 long[] checkSums = new long[aOriginalLines.length];
87 for (int i = 0; i < aOriginalLines.length; i++) {
88 String line = aOriginalLines[i].trim();
89 checkSums[i] = calcChecksum(line);
90 }
91 return checkSums;
92 }
93
94 /**
95 * Computes a checksum for a a single line of text.
96 * @param aLine the aLine
97 * @return checksum
98 */
99 protected long calcChecksum(String aLine)
100 {
101 // important that it's larger than the length of most lines
102 // see http://www.utm.edu/research/primes/lists/small/1000.txt
103 final int bigPrime = 317;
104
105 // TODO: Not sure that this algorithm makes it
106 // sufficiently improbable to get false alarms
107 long result = 0;
108 for (int i = 0; i < aLine.length(); i++) {
109 long c = aLine.charAt(i);
110 long idx = i;
111 result += bigPrime * idx + c;
112 }
113 return result;
114 }
115 }
116
117 /**
118 * A TextfileChecksumGenerator that also ignores imports.
119 */
120 private class JavaChecksumGenerator extends TextfileChecksumGenerator
121 {
122 // TODO: return IGNORE for lines in the header comment?
123 // That would require some simple parsing...
124
125 // we could also parse the java code using the TreeWalker
126 // and then ignore everything before the CLASS_DEF...
127
128
129 /**
130 * computes a checksum for a aLine. to avoid false alarms it is
131 * important that different lines result in different checksums.
132 * @param aLine the aLine
133 * @return checksum
134 */
135 protected long calcChecksum(String aLine)
136 {
137 if (aLine.startsWith("import ")) {
138 return IGNORE;
139 }
140 else {
141 return super.calcChecksum(aLine);
142 }
143 }
144
145 }
146
147 /** a jakarta commons log */
148 private static final Log LOG =
149 LogFactory.getLog(StrictDuplicateCodeCheck.class);
150
151 /** the checksum value to use for lines that should be ignored */
152 private static final long IGNORE = Long.MIN_VALUE;
153
154 /** default value for mMin */
155 private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;
156
157 /** number of lines that have to be idential for reporting duplicates */
158 private int mMin = DEFAULT_MIN_DUPLICATE_LINES;
159
160 /** the basedir to strip off in filenames */
161 private String mBasedir;
162
163 /** the checksums of all files that are currently checked */
164 private long[][] mLineChecksums;
165
166 /** helper to speed up searching algorithm */
167 private long[][] mSortedRelevantChecksums;
168
169 /** files that are currently checked */
170 private File[] mFiles;
171
172 // fields required only for statistics
173
174 /** total number of duplicates found */
175 private int mDuplicates;
176
177 /** lines of code that have been checked */
178 private int mLoc;
179
180 /** number of chache misses */
181 private long mCacheMisses;
182
183 /** number of cache hits */
184 private long mCacheHits;
185
186 /** Creates a new instance of this class. */
187 public StrictDuplicateCodeCheck()
188 {
189 }
190
191 /**
192 * Sets the minimum number of lines that must be equivalent
193 * before the check complains.
194 *
195 * @param aMin the number of lines that must be equal before
196 * triggering a 'duplicate code' message.
197 */
198 public void setMin(int aMin)
199 {
200 mMin = aMin;
201 }
202
203 /** @param aBasedir the base directory to strip off in filenames */
204 public void setBasedir(String aBasedir)
205 {
206 mBasedir = aBasedir;
207 }
208
209 /**
210 * @see com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck#process
211 */
212 public synchronized void process(File[] aFiles)
213 {
214 long start = System.currentTimeMillis();
215 mLoc = 0;
216 mDuplicates = 0;
217 mFiles = filter(aFiles);
218 mLineChecksums = new long[mFiles.length][];
219 mSortedRelevantChecksums = new long[mFiles.length][];
220
221 if (LOG.isDebugEnabled()) {
222 LOG.debug("Reading input files");
223 }
224
225 for (int i = 0; i < mFiles.length; i++) {
226 try {
227 File file = mFiles[i];
228 String[] lines = Utils.getLines(file.getPath());
229 ChecksumGenerator transformer = findChecksumGenerator(file);
230 mLineChecksums[i] = transformer.convertLines(lines);
231 }
232 catch (IOException ex) {
233 LOG.error("Cannot access files to check, giving up: "
234 + ex.getMessage(), ex);
235 // TODO: better to throw an exception here?
236 // it would be best to throw IOException from process(),
237 // but interface definition doesn't allow that...
238 mLineChecksums = new long[0][0];
239 }
240 }
241 fillSortedRelevantChecksums();
242
243 long endReading = System.currentTimeMillis();
244 findDuplicates();
245 long endSearching = System.currentTimeMillis();
246
247 dumpStats(start, endReading, endSearching);
248
249 mLineChecksums = null;
250 mSortedRelevantChecksums = null;
251 }
252
253 /**
254 * Finds the Checksum generator for a given file.
255 *
256 * @param aFile the file to check for duplicates
257 * @return a generator to use for aFile
258 */
259 private ChecksumGenerator findChecksumGenerator(File aFile)
260 {
261 if (aFile.getName().endsWith(".java")) {
262 return new JavaChecksumGenerator();
263 }
264 else {
265 // TODO: Not sure what to do with binary files such as gifs
266 return new TextfileChecksumGenerator();
267 }
268 }
269
270 /**
271 * Dump out statistics data on stderr.
272 * @param aStart start time
273 * @param aEndReading end time of reading phsical files
274 * @param aEndSearching end time duplicate analysis
275 */
276 private void dumpStats(long aStart, long aEndReading, long aEndSearching)
277 {
278 if (LOG.isDebugEnabled()) {
279 final long cacheLookups = mCacheHits + mCacheMisses;
280 final long initTime = aEndReading - aStart;
281 final long workTime = aEndSearching - aEndReading;
282 LOG.debug("cache hits = " + mCacheHits + "/" + cacheLookups);
283 LOG.debug("files = " + mFiles.length);
284 LOG.debug("loc = " + mLoc);
285 LOG.debug("duplicates = " + mDuplicates);
286 LOG.debug("Runtime = " + initTime + " + " + workTime);
287 }
288 }
289
290 /**
291 * filters and sorts the relevant lines and stores the result
292 * in sortedRelevantChecksums during the setup phase.
293 * That data is later used in a binary search to find out
294 * if it is worth investigating a file for duplicates of a block.
295 * If one of the lines in the block does not occur in the other file
296 * at all, we can skip that file quickly.
297 */
298 private void fillSortedRelevantChecksums()
299 {
300 for (int i = 0; i < mLineChecksums.length; i++) {
301 int count = 0;
302 long[] checksums = mLineChecksums[i];
303 long[] relevant = new long[checksums.length];
304 for (int j = 0; j < checksums.length; j++) {
305 long checksum = checksums[j];
306 if (checksum != IGNORE) {
307 relevant[count++] = checksum;
308 }
309 }
310 Arrays.sort(relevant, 0, count);
311 long[] result = new long[count];
312 System.arraycopy(relevant, 0, result, 0, count);
313 mSortedRelevantChecksums[i] = result;
314 }
315 }
316
317 /**
318 * finds duplicate lines in mFiles,
319 * using a textsearch algorithm to find reoccuring
320 * patters in the lineChecksums.
321 */
322 private void findDuplicates()
323 {
324 if (LOG.isDebugEnabled()) {
325 LOG.debug("Analysis phase");
326 }
327
328 // It's been a while since my CS degree, but I think this is
329 // somewhere near O(mMax * LOC^2).
330
331 // It may be possible to do this *much* smarter,
332 // but I don't have the Knuth bible at hand right now :-)
333
334 // OK, prepare for some nested loops... :-(
335
336 for (int i = 0; i < mFiles.length; i++) {
337
338 final String path = mFiles[i].getPath();
339
340 getMessageCollector().reset();
341 MessageDispatcher dispatcher = getMessageDispatcher();
342 dispatcher.fireFileStarted(path);
343
344 mLoc += mLineChecksums[i].length;
345 for (int j = 0; j < i; j++) {
346 findDuplicatesInFiles(i, j);
347 }
348
349 fireErrors(path);
350 dispatcher.fireFileFinished(path);
351 }
352 }
353
354 /**
355 * Compare two files and search for duplicates.
356 * @param aI mLineChecksums index of the first file to compare
357 * @param aJ mLineChecksums index of the seconds file to compare
358 */
359 private void findDuplicatesInFiles(int aI, int aJ)
360 {
361 final int iFileLength = mLineChecksums[aI].length;
362
363 // build up some supporting data structures
364 final boolean[] iLineOccurInJ = new boolean[iFileLength];
365 for (int iLine = 0; iLine < iFileLength; iLine++) {
366 iLineOccurInJ[iLine] = (Arrays.binarySearch(
367 mSortedRelevantChecksums[aJ], mLineChecksums[aI][iLine]) >= 0);
368 }
369
370 // go through all the lines in iFile and check if the following
371 // mMin lines occur in jFile
372 for (int iLine = 0; iLine < iFileLength - mMin; iLine++) {
373
374 // fast exit if one of the lines does not occur in jFile at all
375 boolean fastExit = false;
376 final int kLimit = iFileLength - iLine;
377 for (int k = 0; k < Math.min(mMin, kLimit); k++) {
378 if (!iLineOccurInJ[iLine + k]) {
379 fastExit = true;
380 break;
381 }
382 }
383
384 if (!fastExit) {
385 // all lines do occur -> brute force searching
386 mCacheMisses += 1;
387 iLine = findDuplicateFromLine(aI, aJ, iLine);
388 }
389 else {
390 mCacheHits += 1;
391 }
392 }
393 }
394
395 /**
396 * Find and report a duplicate of the code starting from line aILine
397 * in file aI in the file aJ
398 * @param aI index of file that contains the candidate code
399 * @param aJ index of file that is searched for a dup of the candidate
400 * @param aILine starting line of the candidate in aI
401 * @return the next line in file i where
402 * starting to search will make sense
403 */
404 private int findDuplicateFromLine(int aI, int aJ, int aILine)
405 {
406 // Using something more advanced like Boyer-Moore might be a
407 // good idea...
408
409 final int iFileLength = mLineChecksums[aI].length;
410 final int jFileLength = mLineChecksums[aJ].length;
411
412 for (int jLine = 0; jLine < jFileLength - mMin; jLine++) {
413 int equivalent = 0;
414 while (aILine + equivalent < iFileLength
415 && jLine + equivalent < jFileLength
416 && mLineChecksums[aI][aILine + equivalent] != IGNORE
417 && mLineChecksums[aI][aILine + equivalent]
418 == mLineChecksums[aJ][jLine + equivalent])
419 {
420 equivalent += 1;
421 }
422 if ((aI != aJ || aILine != jLine) && equivalent >= mMin) {
423 reportDuplicate(
424 equivalent, aILine, mFiles[aJ], jLine);
425 aILine += equivalent; // skip to end of equivalent section
426 }
427 }
428 return aILine;
429 }
430
431 /**
432 * Dumps out a duplicate report.
433 * @param aEquivalent number of equivalent lines
434 * @param aILine location of duplicate code
435 * within file that is currently checked
436 * @param aJFile the other file that contains the duplicate
437 * @param aJLine location of duplicate code within aJFile
438 */
439 private void reportDuplicate(
440 int aEquivalent, int aILine, File aJFile, int aJLine)
441 {
442 final Integer dupLines = new Integer(aEquivalent);
443 final Integer startLine = new Integer(aJLine + 1);
444 final String fileName =
445 Utils.getStrippedFileName(mBasedir, aJFile.getPath());
446 log(aILine + 1, "duplicates.lines",
447 new Object[]{dupLines, fileName, startLine});
448 mDuplicates += 1;
449 }
450
451 }