|
|
@@ -14,6 +14,12 @@ import sys
|
|
14
|
14
|
import time
|
|
15
|
15
|
import zlib
|
|
16
|
16
|
|
|
|
17
|
+if sys.version_info < (2, 6):
|
|
|
18
|
+ print >> sys.stderr, "Python 2.6 or higher is required for gitstats"
|
|
|
19
|
+ sys.exit(1)
|
|
|
20
|
+
|
|
|
21
|
+from multiprocessing import Pool
|
|
|
22
|
+
|
|
17
|
23
|
os.environ['LC_ALL'] = 'C'
|
|
18
|
24
|
|
|
19
|
25
|
GNUPLOT_COMMON = 'set terminal png transparent size 640,240\nset size 1.0,1.0\n'
|
|
|
@@ -104,6 +110,20 @@ def getgitversion():
|
|
104
|
110
|
def getgnuplotversion():
|
|
105
|
111
|
return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
|
|
106
|
112
|
|
|
|
113
|
+def getnumoffilesfromrev(time_rev):
|
|
|
114
|
+ """
|
|
|
115
|
+ Get number of files in the tree of the given revision
|
|
|
116
|
+ """
|
|
|
117
|
+ time, rev = time_rev
|
|
|
118
|
+ return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
|
|
|
119
|
+
|
|
|
120
|
+def getnumoflinesinblob(ext_blob):
|
|
|
121
|
+ """
|
|
|
122
|
+ Get number of lines in blob
|
|
|
123
|
+ """
|
|
|
124
|
+ ext, blob_id = ext_blob
|
|
|
125
|
+ return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0]))
|
|
|
126
|
+
|
|
107
|
127
|
class DataCollector:
|
|
108
|
128
|
"""Manages data collection from a revision control repository."""
|
|
109
|
129
|
def __init__(self):
|
|
|
@@ -408,14 +428,34 @@ class GitDataCollector(DataCollector):
|
|
408
|
428
|
# timezone
|
|
409
|
429
|
self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
|
|
410
|
430
|
|
|
411
|
|
- # TODO Optimize this, it's the worst bottleneck
|
|
412
|
431
|
# outputs "<stamp> <files>" for each revision
|
|
413
|
432
|
revlines = getpipeoutput(['git rev-list --pretty=format:"%%at %%T" %s' % getcommitrange('HEAD'), 'grep -v ^commit']).strip().split('\n')
|
|
414
|
433
|
lines = []
|
|
|
434
|
+ revs_to_read = []
|
|
|
435
|
+ time_rev_count = []
|
|
|
436
|
+ #Look up rev in cache and take info from cache if found
|
|
|
437
|
+ #If not append rev to list of rev to read from repo
|
|
415
|
438
|
for revline in revlines:
|
|
416
|
439
|
time, rev = revline.split(' ')
|
|
417
|
|
- linecount = self.getFilesInCommit(rev)
|
|
418
|
|
- lines.append('%d %d' % (int(time), linecount))
|
|
|
440
|
+ #if the cache is empty then add time and rev to the list of new revs
|
|
|
441
|
+ #otherwise try to read needed info from cache
|
|
|
442
|
+ if 'files_in_tree' not in self.cache.keys():
|
|
|
443
|
+ revs_to_read.append((time,rev))
|
|
|
444
|
+ continue
|
|
|
445
|
+ if rev in self.cache['files_in_tree'].keys():
|
|
|
446
|
+ lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
|
|
|
447
|
+ else:
|
|
|
448
|
+ revs_to_read.append((time,rev))
|
|
|
449
|
+
|
|
|
450
|
+ #Read revisions from repo
|
|
|
451
|
+ time_rev_count = Pool(processes=24).map(getnumoffilesfromrev, revs_to_read)
|
|
|
452
|
+
|
|
|
453
|
+ #Update cache with new revisions and append them to the general list
|
|
|
454
|
+ for (time, rev, count) in time_rev_count:
|
|
|
455
|
+ if 'files_in_tree' not in self.cache:
|
|
|
456
|
+ self.cache['files_in_tree'] = {}
|
|
|
457
|
+ self.cache['files_in_tree'][rev] = count
|
|
|
458
|
+ lines.append('%d %d' % (int(time), count))
|
|
419
|
459
|
|
|
420
|
460
|
self.total_commits += len(lines)
|
|
421
|
461
|
for line in lines:
|
|
|
@@ -430,6 +470,7 @@ class GitDataCollector(DataCollector):
|
|
430
|
470
|
|
|
431
|
471
|
# extensions and size of files
|
|
432
|
472
|
lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange('HEAD', end_only = True)]).split('\000')
|
|
|
473
|
+ blobs_to_read = []
|
|
433
|
474
|
for line in lines:
|
|
434
|
475
|
if len(line) == 0:
|
|
435
|
476
|
continue
|
|
|
@@ -437,7 +478,7 @@ class GitDataCollector(DataCollector):
|
|
437
|
478
|
if parts[0] == '160000' and parts[3] == '-':
|
|
438
|
479
|
# skip submodules
|
|
439
|
480
|
continue
|
|
440
|
|
- sha1 = parts[2]
|
|
|
481
|
+ blob_id = parts[2]
|
|
441
|
482
|
size = int(parts[3])
|
|
442
|
483
|
fullpath = parts[4]
|
|
443
|
484
|
|
|
|
@@ -451,15 +492,28 @@ class GitDataCollector(DataCollector):
|
|
451
|
492
|
ext = filename[(filename.rfind('.') + 1):]
|
|
452
|
493
|
if len(ext) > conf['max_ext_length']:
|
|
453
|
494
|
ext = ''
|
|
454
|
|
-
|
|
455
|
495
|
if ext not in self.extensions:
|
|
456
|
496
|
self.extensions[ext] = {'files': 0, 'lines': 0}
|
|
457
|
|
-
|
|
458
|
497
|
self.extensions[ext]['files'] += 1
|
|
459
|
|
- try:
|
|
460
|
|
- self.extensions[ext]['lines'] += self.getLinesInBlob(sha1)
|
|
461
|
|
- except:
|
|
462
|
|
- print 'Warning: Could not count lines for file "%s"' % line
|
|
|
498
|
+ #if the cache is empty then add ext and blob id to the list of new blobs
|
|
|
499
|
+ #otherwise try to read needed info from cache
|
|
|
500
|
+ if 'lines_in_blob' not in self.cache.keys():
|
|
|
501
|
+ blobs_to_read.append((ext,blob_id))
|
|
|
502
|
+ continue
|
|
|
503
|
+ if blob_id in self.cache['lines_in_blob'].keys():
|
|
|
504
|
+ self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
|
|
|
505
|
+ else:
|
|
|
506
|
+ blobs_to_read.append((ext,blob_id))
|
|
|
507
|
+
|
|
|
508
|
+ #Get info about line count for new blobs that weren't found in cache
|
|
|
509
|
+ ext_blob_linecount = Pool(processes=24).map(getnumoflinesinblob, blobs_to_read)
|
|
|
510
|
+
|
|
|
511
|
+ #Update cache and record the number of lines per extension
|
|
|
512
|
+ for (ext, blob_id, linecount) in ext_blob_linecount:
|
|
|
513
|
+ if 'lines_in_blob' not in self.cache:
|
|
|
514
|
+ self.cache['lines_in_blob'] = {}
|
|
|
515
|
+ self.cache['lines_in_blob'][blob_id] = linecount
|
|
|
516
|
+ self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
|
|
463
|
517
|
|
|
464
|
518
|
# line statistics
|
|
465
|
519
|
# outputs:
|
|
|
@@ -619,33 +673,12 @@ class GitDataCollector(DataCollector):
|
|
619
|
673
|
def getDomains(self):
|
|
620
|
674
|
return self.domains.keys()
|
|
621
|
675
|
|
|
622
|
|
- def getFilesInCommit(self, rev):
|
|
623
|
|
- try:
|
|
624
|
|
- res = self.cache['files_in_tree'][rev]
|
|
625
|
|
- except:
|
|
626
|
|
- res = int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])
|
|
627
|
|
- if 'files_in_tree' not in self.cache:
|
|
628
|
|
- self.cache['files_in_tree'] = {}
|
|
629
|
|
- self.cache['files_in_tree'][rev] = res
|
|
630
|
|
-
|
|
631
|
|
- return res
|
|
632
|
|
-
|
|
633
|
676
|
def getFirstCommitDate(self):
|
|
634
|
677
|
return datetime.datetime.fromtimestamp(self.first_commit_stamp)
|
|
635
|
678
|
|
|
636
|
679
|
def getLastCommitDate(self):
|
|
637
|
680
|
return datetime.datetime.fromtimestamp(self.last_commit_stamp)
|
|
638
|
681
|
|
|
639
|
|
- def getLinesInBlob(self, sha1):
|
|
640
|
|
- try:
|
|
641
|
|
- res = self.cache['lines_in_blob'][sha1]
|
|
642
|
|
- except:
|
|
643
|
|
- res = int(getpipeoutput(['git cat-file blob %s' % sha1, 'wc -l']).split()[0])
|
|
644
|
|
- if 'lines_in_blob' not in self.cache:
|
|
645
|
|
- self.cache['lines_in_blob'] = {}
|
|
646
|
|
- self.cache['lines_in_blob'][sha1] = res
|
|
647
|
|
- return res
|
|
648
|
|
-
|
|
649
|
682
|
def getTags(self):
|
|
650
|
683
|
lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
|
|
651
|
684
|
return lines.split('\n')
|