Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java


1   ////////////////////////////////////////////////////////////////////////////////
2   // checkstyle: Checks Java source code for adherence to a set of rules.
3   // Copyright (C) 2001-2003  Oliver Burn
4   //
5   // This library is free software; you can redistribute it and/or
6   // modify it under the terms of the GNU Lesser General Public
7   // License as published by the Free Software Foundation; either
8   // version 2.1 of the License, or (at your option) any later version.
9   //
10  // This library is distributed in the hope that it will be useful,
11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  // Lesser General Public License for more details.
14  //
15  // You should have received a copy of the GNU Lesser General Public
16  // License along with this library; if not, write to the Free Software
17  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  ////////////////////////////////////////////////////////////////////////////////
19  package com.puppycrawl.tools.checkstyle.checks.duplicates;
20  
21  import java.io.File;
22  import java.io.IOException;
23  import java.util.Arrays;
24  
25  import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck;
26  import com.puppycrawl.tools.checkstyle.api.Utils;
27  import com.puppycrawl.tools.checkstyle.api.MessageDispatcher;
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  
31  /**
32   * Checks for duplicate code.
33   *
34   * <p>
35   * There are many approaches for detecting duplicate code. Some involve
36   * parsing a file of a programming language and analyzing the source trees
37   * of all files. This is a very powerful approach for a specific programming
38   * language (such as Java), as it can potentially even detect duplicate code
39   * where linebreaks have been changed, variables have been renamed, etc.
40   * </p>
41   * <p>
42   * This copy and paste detection implementation works differently.
43   * It cannot detect copy and paste code where the author deliberately
44   * tries to hide his copy+paste action. Instead it focusses on the standard
45   * corporate problem of reuse by copy and paste. Usually this leaves linebreaks
46   * and variable names intact. Since we do not need to analyse a parse tree
47   * our tool is not tied to a particular programming language.
48   * </p>
49   * <p>
50   * <a href="http://www.redhillconsulting.com.au/products/simian/">Simian</a>
51   * is a very good commercial duplicate code detection tool. It comes with
52   * a Checkstyle module, so we encourage all users to evaluate Simian
53   * as an alternative to this check.
54   * </p>
55   *
56   * @author Lars K&uuml;hne
57   */
58  public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
59  {
60      /**
61       * Converts each of the original source lines
62       * to a checksum that is checked against to find duplicates.
63       */
64      private interface ChecksumGenerator
65      {
66          /**
67           * Convert each of the original source lines
68           * to a checksum that is checked against to find duplicates
69           * Typically this involves stripping whitespace.
70           * @param aOriginalLines the original lines as they appear in the source
71           * @return the converted line or null if the line should be ignored
72           */
73          long[] convertLines(String[] aOriginalLines);
74      }
75  
76  
77      /**
78       * Calculates checksums for text files.
79       * Removes leading and trainling whitespace.
80       */
81      private class TextfileChecksumGenerator implements ChecksumGenerator
82      {
83          /** @see ChecksumGenerator#convertLines */
84          public long[] convertLines(String[] aOriginalLines)
85          {
86              long[] checkSums = new long[aOriginalLines.length];
87              for (int i = 0; i < aOriginalLines.length; i++) {
88                  String line = aOriginalLines[i].trim();
89                  checkSums[i] = calcChecksum(line);
90              }
91              return checkSums;
92          }
93  
94          /**
95           * Computes a checksum for a a single line of text.
96           * @param aLine the aLine
97           * @return checksum
98           */
99          protected long calcChecksum(String aLine)
100         {
101             // important that it's larger than the length of most lines
102             // see http://www.utm.edu/research/primes/lists/small/1000.txt
103             final int bigPrime = 317;
104 
105             // TODO: Not sure that this algorithm makes it
106             // sufficiently improbable to get false alarms
107             long result = 0;
108             for (int i = 0; i < aLine.length(); i++) {
109                 long c = aLine.charAt(i);
110                 long idx = i;
111                 result += bigPrime * idx + c;
112             }
113             return result;
114         }
115     }
116 
117     /**
118      * A TextfileChecksumGenerator that also ignores imports.
119      */
120     private class JavaChecksumGenerator extends TextfileChecksumGenerator
121     {
122         // TODO: return IGNORE for lines in the header comment?
123         // That would require some simple parsing...
124 
125         // we could also parse the java code using the TreeWalker
126         // and then ignore everything before the CLASS_DEF...
127 
128 
129         /**
130          * computes a checksum for a aLine. to avoid false alarms it is
131          * important that different lines result in different checksums.
132          * @param aLine the aLine
133          * @return checksum
134          */
135         protected long calcChecksum(String aLine)
136         {
137             if (aLine.startsWith("import ")) {
138                 return IGNORE;
139             }
140             else {
141                 return super.calcChecksum(aLine);
142             }
143         }
144 
145     }
146 
147     /** a jakarta commons log */
148     private static final Log LOG =
149             LogFactory.getLog(StrictDuplicateCodeCheck.class);
150 
151     /** the checksum value to use for lines that should be ignored */
152     private static final long IGNORE = Long.MIN_VALUE;
153 
154     /** default value for mMin */
155     private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;
156 
157     /** number of lines that have to be idential for reporting duplicates */
158     private int mMin = DEFAULT_MIN_DUPLICATE_LINES;
159 
160     /** the basedir to strip off in filenames */
161     private String mBasedir;
162 
163     /** the checksums of all files that are currently checked */
164     private long[][] mLineChecksums;
165 
166     /** helper to speed up searching algorithm */
167     private long[][] mSortedRelevantChecksums;
168 
169     /** files that are currently checked */
170     private File[] mFiles;
171 
172     // fields required only for statistics
173 
174     /** total number of duplicates found */
175     private int mDuplicates;
176 
177     /** lines of code that have been checked */
178     private int mLoc;
179 
180     /** number of chache misses */
181     private long mCacheMisses;
182 
183     /** number of cache hits */
184     private long mCacheHits;
185 
186     /** Creates a new instance of this class. */
187     public StrictDuplicateCodeCheck()
188     {
189     }
190 
191     /**
192      * Sets the minimum number of lines that must be equivalent
193      * before the check complains.
194      *
195      * @param aMin the number of lines that must be equal before
196      * triggering a 'duplicate code' message.
197      */
198     public void setMin(int aMin)
199     {
200         mMin = aMin;
201     }
202 
203     /** @param aBasedir the base directory to strip off in filenames */
204     public void setBasedir(String aBasedir)
205     {
206         mBasedir = aBasedir;
207     }
208 
209     /**
210      * @see com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck#process
211      */
212     public synchronized void process(File[] aFiles)
213     {
214         long start = System.currentTimeMillis();
215         mLoc = 0;
216         mDuplicates = 0;
217         mFiles = filter(aFiles);
218         mLineChecksums = new long[mFiles.length][];
219         mSortedRelevantChecksums = new long[mFiles.length][];
220 
221         if (LOG.isDebugEnabled()) {
222             LOG.debug("Reading input files");
223         }
224 
225         for (int i = 0; i < mFiles.length; i++) {
226             try {
227                 File file = mFiles[i];
228                 String[] lines = Utils.getLines(file.getPath());
229                 ChecksumGenerator transformer = findChecksumGenerator(file);
230                 mLineChecksums[i] = transformer.convertLines(lines);
231             }
232             catch (IOException ex) {
233                 LOG.error("Cannot access files to check, giving up: "
234                         + ex.getMessage(), ex);
235                 // TODO: better to throw an exception here?
236                 // it would be best to throw IOException from process(),
237                 // but interface definition doesn't allow that...
238                 mLineChecksums = new long[0][0];
239             }
240         }
241         fillSortedRelevantChecksums();
242 
243         long endReading = System.currentTimeMillis();
244         findDuplicates();
245         long endSearching = System.currentTimeMillis();
246 
247         dumpStats(start, endReading, endSearching);
248 
249         mLineChecksums = null;
250         mSortedRelevantChecksums = null;
251     }
252 
253     /**
254      * Finds the Checksum generator for a given file.
255      *
256      * @param aFile the file to check for duplicates
257      * @return a generator to use for aFile
258      */
259     private ChecksumGenerator findChecksumGenerator(File aFile)
260     {
261         if (aFile.getName().endsWith(".java")) {
262             return new JavaChecksumGenerator();
263         }
264         else {
265             // TODO: Not sure what to do with binary files such as gifs
266             return new TextfileChecksumGenerator();
267         }
268     }
269 
270     /**
271      * Dump out statistics data on stderr.
272      * @param aStart start time
273      * @param aEndReading end time of reading phsical files
274      * @param aEndSearching end time duplicate analysis
275      */
276     private void dumpStats(long aStart, long aEndReading, long aEndSearching)
277     {
278         if (LOG.isDebugEnabled()) {
279             final long cacheLookups = mCacheHits + mCacheMisses;
280             final long initTime = aEndReading - aStart;
281             final long workTime = aEndSearching - aEndReading;
282             LOG.debug("cache hits = " + mCacheHits + "/" + cacheLookups);
283             LOG.debug("files = " + mFiles.length);
284             LOG.debug("loc = " + mLoc);
285             LOG.debug("duplicates = " + mDuplicates);
286             LOG.debug("Runtime = " + initTime + " + " + workTime);
287         }
288     }
289 
290     /**
291      * filters and sorts the relevant lines and stores the result
292      * in sortedRelevantChecksums during the setup phase.
293      * That data is later used in a binary search to find out
294      * if it is worth investigating a file for duplicates of a block.
295      * If one of the lines in the block does not occur in the other file
296      * at all, we can skip that file quickly.
297      */
298     private void fillSortedRelevantChecksums()
299     {
300         for (int i = 0; i < mLineChecksums.length; i++) {
301             int count = 0;
302             long[] checksums = mLineChecksums[i];
303             long[] relevant = new long[checksums.length];
304             for (int j = 0; j < checksums.length; j++) {
305                 long checksum = checksums[j];
306                 if (checksum != IGNORE) {
307                     relevant[count++] = checksum;
308                 }
309             }
310             Arrays.sort(relevant, 0, count);
311             long[] result = new long[count];
312             System.arraycopy(relevant, 0, result, 0, count);
313             mSortedRelevantChecksums[i] = result;
314         }
315     }
316 
317     /**
318      * finds duplicate lines in mFiles,
319      * using a textsearch algorithm to find reoccuring
320      * patters in the lineChecksums.
321      */
322     private void findDuplicates()
323     {
324         if (LOG.isDebugEnabled()) {
325             LOG.debug("Analysis phase");
326         }
327 
328         // It's been a while since my CS degree, but I think this is
329         // somewhere near O(mMax * LOC^2).
330 
331         // It may be possible to do this *much* smarter,
332         // but I don't have the Knuth bible at hand right now :-)
333 
334         // OK, prepare for some nested loops... :-(
335 
336         for (int i = 0; i < mFiles.length; i++) {
337 
338             final String path = mFiles[i].getPath();
339 
340             getMessageCollector().reset();
341             MessageDispatcher dispatcher = getMessageDispatcher();
342             dispatcher.fireFileStarted(path);
343 
344             mLoc += mLineChecksums[i].length;
345             for (int j = 0; j < i; j++) {
346                 findDuplicatesInFiles(i, j);
347             }
348 
349             fireErrors(path);
350             dispatcher.fireFileFinished(path);
351         }
352     }
353 
354     /**
355      * Compare two files and search for duplicates.
356      * @param aI mLineChecksums index of the first file to compare
357      * @param aJ mLineChecksums index of the seconds file to compare
358      */
359     private void findDuplicatesInFiles(int aI, int aJ)
360     {
361         final int iFileLength = mLineChecksums[aI].length;
362 
363         // build up some supporting data structures
364         final boolean[] iLineOccurInJ = new boolean[iFileLength];
365         for (int iLine = 0; iLine < iFileLength; iLine++) {
366             iLineOccurInJ[iLine] = (Arrays.binarySearch(
367                 mSortedRelevantChecksums[aJ], mLineChecksums[aI][iLine]) >= 0);
368         }
369 
370         // go through all the lines in iFile and check if the following
371         // mMin lines occur in jFile
372         for (int iLine = 0; iLine < iFileLength - mMin; iLine++) {
373 
374             // fast exit if one of the lines does not occur in jFile at all
375             boolean fastExit = false;
376             final int kLimit = iFileLength - iLine;
377             for (int k = 0; k < Math.min(mMin, kLimit); k++) {
378                 if (!iLineOccurInJ[iLine + k]) {
379                     fastExit = true;
380                     break;
381                 }
382             }
383 
384             if (!fastExit) {
385                 // all lines do occur -> brute force searching
386                 mCacheMisses += 1;
387                 iLine = findDuplicateFromLine(aI, aJ, iLine);
388             }
389             else {
390                 mCacheHits += 1;
391             }
392         }
393     }
394 
395     /**
396      * Find and report a duplicate of the code starting from line aILine
397      * in file aI in the file aJ
398      * @param aI index of file that contains the candidate code
399      * @param aJ index of file that is searched for a dup of the candidate
400      * @param aILine starting line of the candidate in aI
401      * @return the next line in file i where
402      * starting to search will make sense
403      */
404     private int findDuplicateFromLine(int aI, int aJ, int aILine)
405     {
406         // Using something more advanced like Boyer-Moore might be a
407         // good idea...
408 
409         final int iFileLength = mLineChecksums[aI].length;
410         final int jFileLength = mLineChecksums[aJ].length;
411 
412         for (int jLine = 0; jLine < jFileLength - mMin; jLine++) {
413             int equivalent = 0;
414             while (aILine + equivalent < iFileLength
415                     && jLine + equivalent < jFileLength
416                     && mLineChecksums[aI][aILine + equivalent] != IGNORE
417                     && mLineChecksums[aI][aILine + equivalent]
418                        == mLineChecksums[aJ][jLine + equivalent])
419             {
420                 equivalent += 1;
421             }
422             if ((aI != aJ || aILine != jLine) && equivalent >= mMin) {
423                 reportDuplicate(
424                         equivalent, aILine, mFiles[aJ], jLine);
425                 aILine += equivalent; // skip to end of equivalent section
426             }
427         }
428         return aILine;
429     }
430 
431     /**
432      * Dumps out a duplicate report.
433      * @param aEquivalent number of equivalent lines
434      * @param aILine location of duplicate code
435      * within file that is currently checked
436      * @param aJFile the other file that contains the duplicate
437      * @param aJLine location of duplicate code within aJFile
438      */
439     private void reportDuplicate(
440             int aEquivalent, int aILine, File aJFile, int aJLine)
441     {
442         final Integer dupLines = new Integer(aEquivalent);
443         final Integer startLine = new Integer(aJLine + 1);
444         final String fileName =
445                 Utils.getStrippedFileName(mBasedir, aJFile.getPath());
446         log(aILine + 1, "duplicates.lines",
447                 new Object[]{dupLines, fileName, startLine});
448         mDuplicates += 1;
449     }
450 
451 }