
Refactor data generators into separate classes

Use a callback pattern to handle processing of individual rows of data from the data generators
Use dataclasses to define individual data instances
Dan Rapp, 7 years ago
parent
commit
2047313000
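
To make the refactor concrete, here is a minimal sketch of the generator/callback pattern this commit introduces. Everything in it (ExampleRow, gen_example_data) is invented for illustration; the real generators (gen_author_data, gen_file_data, ...) shell out to git and pass dataclass instances such as AuthorRow to the supplied row_processor.

from dataclasses import dataclass

@dataclass
class ExampleRow:                     # hypothetical stand-in for AuthorRow / File / ...
    author: str
    commits: int

def gen_example_data(row_processor):
    # a real generator parses `git log` output here; these rows are made up
    for author, commits in [('alice', 3), ('bob', 1)]:
        row_processor(ExampleRow(author, commits))

totals = {}
def collect_row(row: ExampleRow):     # the callback decides what to do with each row
    totals[row.author] = totals.get(row.author, 0) + row.commits

gen_example_data(collect_row)
print(totals)                         # {'alice': 3, 'bob': 1}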

gitstats/__init__.py (+2 -6)

@@ -1,11 +1,7 @@
-from gitstats.gitstats import GitStats
+from gitstats.process import run
 from gitstats._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
 
-def main():
-    g = GitStats()
-    g.run()
-
 if __name__ == "__main__":
-    main()
+    run()

gitstats/cd.py (+16 -0)

@@ -0,0 +1,16 @@
+"""Holds the current working directory context class. A python version of pushd/popd"""
+import os
+
+# pylint: disable=too-few-public-methods
+class cd: # pylint: disable=invalid-name
+    """Context manager for changing the current working directory"""
+    def __init__(self, newPath):
+        self.new_path = os.path.expanduser(newPath)
+        self.saved_path = os.getcwd()
+
+    def __enter__(self):
+        self.saved_path = os.getcwd()
+        os.chdir(self.new_path)
+
+    def __exit__(self, etype, value, traceback):
+        os.chdir(self.saved_path)
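
For reference, a quick usage sketch of the cd context manager; the repository path below is hypothetical.

import os
from gitstats import cd

with cd.cd('~/src/some-repo'):        # hypothetical path
    print(os.getcwd())                # now inside the repository
print(os.getcwd())                    # back in the original working directory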

gitstats/cli.py (+60 -0)

@@ -0,0 +1,60 @@
+import getopt
+import logging
+import os
+import sys
+
+conf = {
+    'max_domains': 10,
+    'max_ext_length': 10,
+    'style': 'gitstats.css',
+    'max_authors': 20,
+    'authors_top': 5,
+    'commit_begin': '',
+    'commit_end': 'HEAD',
+    'linear_linestats': 1,
+    'project_name': '',
+    'processes': 8,
+    'start_date': '',
+    'logging': logging.INFO,
+}
+
+
+def _usage():
+    print(f"""
+Usage: gitstats [options] <gitpath..> <outputpath>
+
+Options:
+-c key=value     Override configuration value
+-n key=value     Define author name equivalency (key will be treated the same as value)
+
+Default config values:
+{conf}
+
+Please see the manual page for more details.
+""")
+
+
+def get_cli():
+    optlist, args = getopt.getopt(sys.argv[1:], 'hc:', ["help"])
+    for o, v in optlist:
+        if o == '-c':
+            key, value = v.split('=', 1)
+            if key not in conf:
+                raise KeyError('no such key "%s" in config' % key)
+            if isinstance(conf[key], int):
+                conf[key] = int(value)
+            else:
+                conf[key] = value
+        elif o in ('-h', '--help'):
+            _usage()
+            sys.exit()
+
+    if len(args) < 2:
+        _usage()
+        sys.exit(0)
+
+    outputpath = os.path.abspath(args[-1])
+    paths = args[0:-1]
+    outputpath = os.path.abspath(outputpath)
+
+    return conf, paths, outputpath
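
A hedged sketch of driving get_cli() directly (normally argv comes from the gitstats entry point); the repository and output paths are made up.

import sys
from gitstats import cli

sys.argv = ['gitstats', '-c', 'max_authors=30', '/path/to/repo', '/tmp/gitstats-out']
conf, paths, outputpath = cli.get_cli()
print(conf['max_authors'], paths, outputpath)   # 30 ['/path/to/repo'] /tmp/gitstats-out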

gitstats/data/__init__.py (+7 -0)

@@ -0,0 +1,7 @@
+from .author import Author
+from .author_row import AuthorRow
+from .author_totals import AuthorTotals
+from .tag import Tag
+from .revision import Revision
+from .file import File
+from .loc_by_date import LocByDate

gitstats/data/author_row.py (+10 -0)

@@ -0,0 +1,10 @@
+from dataclasses import dataclass
+
+@dataclass
+class AuthorRow:
+    sha: str
+    stamp: int
+    author: str
+    files_modified: int
+    lines_inserted: int
+    lines_deleted: int

gitstats/data/author_totals.py (+6 -0)

@@ -0,0 +1,6 @@
+from dataclasses import dataclass
+
+@dataclass
+class AuthorTotals:
+    author: str
+    total_commits: int

gitstats/data/file.py (+10 -0)

@@ -0,0 +1,10 @@
+from dataclasses import dataclass
+
+@dataclass
+class File:
+    full_path: str
+    ext: str
+    size: int
+    lines: int = 0
+
+

gitstats/data/loc_by_date.py (+13 -0)

@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class LocByDate:
+    hash: str = ''
+    stamp: int = 0
+    file_count: int = 0
+    lines_inserted: int = 0
+    lines_deleted: int = 0
+    total_lines: int = 0
+
+

gitstats/data/revision.py (+16 -0)

@@ -0,0 +1,16 @@
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Dict
+
+# Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
+
+@dataclass
+class Revision:
+    sha: str
+    stamp: int
+    timezone: int = 0
+    author: str = ''
+    email: str = ''
+    domain: str = ''
+    file_count: int = 0
+

gitstats/data/tag.py (+13 -0)

@@ -0,0 +1,13 @@
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Dict
+
+@dataclass
+class Tag:
+    tag: str
+    stamp: int
+    hash: str
+    commits: int = 0
+    authors: Dict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+
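
Note on the default above: dataclasses call default_factory with no arguments to build a fresh value per instance, which is why the defaultdict is wrapped in a lambda; passing a defaultdict instance directly would fail the first time Tag() is built without an explicit authors value. A small sketch (Example is hypothetical):

from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class Example:
    # each instance gets its own defaultdict(int); the lambda defers creation
    counts: Dict[str, int] = field(default_factory=lambda: defaultdict(int))

e = Example()
e.counts['alice'] += 1
print(e.counts)   # defaultdict(<class 'int'>, {'alice': 1})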

gitstats/data_generators/__init__.py (+6 -0)

@@ -0,0 +1,6 @@
+from .gen_author_data import gen_author_data
+from .gen_author_totals import gen_author_totals_data
+from .gen_tag_data import gen_tag_data
+from .gen_revision_data import gen_revision_data
+from .gen_file_data import gen_file_data
+from .gen_loc_data import gen_loc_data

gitstats/data_generators/gen_author_data.py (+88 -0)

@@ -0,0 +1,88 @@
+import csv
+import logging
+import os
+import re
+
+from gitstats import cli, cd
+from gitstats.miscfuncs import getlogrange, getpipeoutput, getstatsummarycounts
+from gitstats.data import AuthorRow
+
+
+def gen_author_data(conf, row_processor):
+    '''
+    Given a configuration, pull authorship information. For
+    each author, callback to the row_processor passing an AuthorRow
+
+    :param conf: configuration (mostly used for date limits)
+    :param row_processor: function to receive the callback
+    :return: None
+    '''
+
+    # DBG: git log --shortstat --date-order --pretty=format:"%H %at %aN" --since="2017-10-01" "HEAD"
+    # Results are in the form of
+    #
+    # 3c16756701d264619db0b309f42ebdc713b29827 1522513256 Dan Rapp
+    # 524ee0d32ffbbb8bb82966b769bbf7dbc1d87a68 1522480979 Michael Wright
+    # 1 file changed, 6 insertions(+)
+    #
+    # If there are two (or more) lines,
+    # The first line(s) is the merge to master or other branch
+    # The last line is the commit on the branch
+    lines = getpipeoutput(
+        ['git log --shortstat --date-order --pretty=format:"%%H %%at %%aN" %s' % (
+            getlogrange(conf, 'HEAD'))]).split('\n')
+    lines.reverse()
+
+    files = 0
+    inserted = 0
+    deleted = 0
+    stamp = 0
+    for line in lines:
+        if len(line) == 0:
+            continue
+
+        # <stamp> <author>
+        if re.search('files? changed', line) is None:
+            if files + inserted + deleted > 0:  # this case indicates we've already processed the line
+                pos = line.find(' ')
+                if pos != -1:
+                    try:
+                        oldstamp = stamp
+                        tokens = line.split()
+                        sha = tokens[0]
+                        stamp = int(tokens[1])
+                        author = ' '.join(tokens[2:])
+                        if oldstamp > stamp:
+                            # clock skew, keep old timestamp to avoid having ugly graph
+                            stamp = oldstamp
+                        row_processor(AuthorRow(sha, stamp, author, files, inserted, deleted))
+                        # Since subsequent lines are (generally) reflections of merging into a branch
+                        # don't provide "credit" to the author who did the merge
+                        (files, inserted, deleted) = 0, 0, 0
+                    except ValueError:
+                        logging.warning(f'unexpected line "{line}"')
+                else:
+                    logging.warning(f'unexpected line "{line}"')
+        else:
+            numbers = getstatsummarycounts(line)
+
+            if len(numbers) == 3:
+                (files, inserted, deleted) = map(lambda el: int(el), numbers)
+            else:
+                logging.warning(f'Failed to handle line "{line}"')
+                (files, inserted, deleted) = (0, 0, 0)
+
+if __name__ == "__main__":
+    conf, paths, outputpath = cli.get_cli()
+    with open(outputpath, 'w', encoding='utf8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['repo', 'sha', 'stamp', 'author', 'files changed', 'lines inserted', 'lines deleted'])
+
+        for path in paths:
+            repo_name = os.path.split(path)[1]
+            with (cd.cd(path)):
+
+                gen_author_data(
+                    conf,
+                    lambda row: writer.writerow([repo_name, row.sha, row.stamp, row.author, row.files_modified,
+                                                 row.lines_inserted, row.lines_deleted]))
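
As an illustration of consuming this generator outside the CSV __main__ block, a hedged sketch that totals insertions per author in memory (it must run inside a git working directory; the aggregation is mine, not part of the commit):

from collections import defaultdict

from gitstats.cli import conf
from gitstats.data import AuthorRow
from gitstats.data_generators import gen_author_data

lines_added = defaultdict(int)        # author -> total lines inserted

def row_processor(row: AuthorRow):
    lines_added[row.author] += row.lines_inserted

gen_author_data(conf, row_processor)
for author, total in sorted(lines_added.items(), key=lambda kv: -kv[1]):
    print(f'{author}: {total}')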

gitstats/data_generators/gen_author_totals.py (+47 -0)

@@ -0,0 +1,47 @@
+import csv
+import os
+
+from gitstats import cli, cd
+from gitstats.miscfuncs import getlogrange, getpipeoutput
+from gitstats.data import AuthorTotals
+
+
+def gen_author_totals_data(conf, row_processor=None, revision_range=None):
+    '''
+    Given a configuration, pull total commits per author. For
+    each "row" callback to the row_processor passing an AuthorTotals
+
+    :param conf: configuration (mostly used for date limits)
+    :param row_processor: function to receive the callback
+    :return: count of the number of authors
+    '''
+
+    # DBG: git shortlog -s --since="2017-10-01" "HEAD"
+    if not revision_range:
+        revision_range = getlogrange(conf)
+    lines = getpipeoutput(['git shortlog -s %s' % revision_range]).split('\n')
+    count = 0
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+        count += 1
+        if row_processor:
+            tokens = line.split()
+            commit_count = int(tokens[0])
+            author = ' '.join(tokens[1:])
+            row_processor(AuthorTotals(author, commit_count))
+    return count
+
+if __name__ == "__main__":
+    conf, paths, outputpath = cli.get_cli()
+    with open(outputpath, 'w', encoding='utf8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['repo', 'author', 'commits'])
+
+        for path in paths:
+            repo_name = os.path.split(path)[1]
+            with (cd.cd(path)):
+                gen_author_totals_data(
+                    conf,
+                    lambda row: writer.writerow([repo_name, row.author, row.total_commits]))

gitstats/data_generators/gen_file_data.py (+65 -0)

@@ -0,0 +1,65 @@
+import csv
+import os
+import re
+
+from multiprocessing import Pool
+
+from gitstats import cli, cd
+from gitstats.miscfuncs import getcommitrange, getpipeoutput, getnumoflinesinblob
+from gitstats.data import File
+
+
+def gen_file_data(conf, row_processor):
+    '''
+    Given a configuration, pull file information. For
+    each file, callback to the row_processor passing a File
+
+    :param conf: configuration (mostly used for date limits)
+    :param row_processor: function to receive the callback
+    :return: None
+    '''
+
+    # extensions and size of files
+
+    # DBG: git ls-tree -r -l -z HEAD
+    lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange(conf, 'HEAD', end_only=True)]).split(
+        '\000')
+    blobs_to_read = {} # blob_id -> File
+    for line in lines:
+        if len(line) == 0:
+            continue
+        parts = re.split('\s+', line, 4)
+        if parts[0] == '160000' and parts[3] == '-':
+            # skip submodules
+            continue
+        blob_id = parts[2]
+        size = int(parts[3])
+        fullpath = parts[4]
+        _, ext = os.path.splitext(fullpath)
+        blobs_to_read[blob_id] = File(fullpath, ext, size)
+
+    # DBG: git cat-file blob e4f17a621893811250be96c8ef9c37b5e97a1df7', 'wc -l'
+    pool = Pool(processes=conf['processes'])
+    blob_linecount = pool.map(getnumoflinesinblob, blobs_to_read.keys())
+    pool.terminate()
+    pool.join()
+    # Record the number of lines for each blob
+    for (blob_id, linecount) in blob_linecount:
+        file_data = blobs_to_read[blob_id]
+        file_data.lines = linecount
+        row_processor(file_data)
+
+
+if __name__ == "__main__":
+    conf, paths, outputpath = cli.get_cli()
+    with open(outputpath, 'w', encoding='utf8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['repo', 'file', 'ext', 'size', 'line_count'])
+
+        for path in paths:
+            repo_name = os.path.split(path)[1]
+            with (cd.cd(path)):
+
+                gen_file_data(
+                    conf,
+                    lambda row: writer.writerow([repo_name, row.full_path, row.ext, row.size, row.lines]))
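
A small self-contained illustration of the Pool pattern used above: the worker returns a (blob_id, linecount) tuple, so results can be matched back to the File objects by id. The worker and blob ids here are stand-ins, not the real getnumoflinesinblob.

from multiprocessing import Pool

def count_lines(blob_id):                         # stand-in worker
    fake_counts = {'aaa111': 10, 'bbb222': 3}     # made-up blob ids
    return blob_id, fake_counts[blob_id]

if __name__ == '__main__':
    pool = Pool(processes=2)
    results = pool.map(count_lines, ['aaa111', 'bbb222'])
    pool.terminate()
    pool.join()
    print(dict(results))                          # {'aaa111': 10, 'bbb222': 3}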

gitstats/data_generators/gen_loc_data.py (+80 -0)

@@ -0,0 +1,80 @@
+import csv
+import logging
+import os
+import re
+
+from gitstats import cli, cd
+from gitstats.miscfuncs import getlogrange, getpipeoutput, getstatsummarycounts
+from gitstats.data import LocByDate
+
+
+def gen_loc_data(conf, row_processor):
+    '''
+    Given a configuration, pull lines-of-code information. For
+    each commit, callback to the row_processor passing a LocByDate
+
+    :param conf: configuration (mostly used for date limits)
+    :param row_processor: function to receive the callback
+    :return: total lines in repo
+    '''
+
+    # line statistics
+    # outputs:
+    #  N files changed, N insertions (+), N deletions(-)
+
+    # computation of lines of code by date is better done
+    # on a linear history.
+    extra = ''
+    if conf['linear_linestats']:
+        extra = '--first-parent -m'
+
+    # DBG: git log --shortstat --first-parent -m --pretty=format:"%at %aN" --since="2017-10-01" "HEAD"'
+    lines = getpipeoutput(
+        ['git log --shortstat %s --pretty=format:"%%H %%at %%aN" %s' % (extra, getlogrange(conf, 'HEAD'))]).split('\n')
+    lines.reverse()
+    files = 0
+    inserted = 0
+    deleted = 0
+    total_lines = 0
+    for line in lines:
+        if len(line) == 0:
+            continue
+
+        if re.search('files? changed', line) is None:
+            line = line.strip()
+            if line:
+                try:
+                    parts = line.split(' ', 2)
+                    (hash, stamp, author) = (parts[0], int(parts[1]), parts[2])
+                    row_processor(LocByDate(hash, stamp, files, inserted, deleted, total_lines))
+                    files, inserted, deleted = 0, 0, 0
+                except ValueError:
+                    logging.warning(f'unexpected line "{line}"')
+            else:
+                logging.warning(f'unexpected line "{line}"')
+        else:
+            numbers = getstatsummarycounts(line)
+
+            if len(numbers) == 3:
+                (files, inserted, deleted) = map(lambda el: int(el), numbers)
+                total_lines += inserted
+                total_lines -= deleted
+            else:
+                logging.warning(f'Failed to handle line "{line}"')
+                (files, inserted, deleted) = (0, 0, 0)
+    return total_lines
+
+
+if __name__ == "__main__":
+    conf, paths, outputpath = cli.get_cli()
+    with open(outputpath, 'w', encoding='utf8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['repo', 'sha', 'stamp', 'file count', 'lines inserted', 'lines deleted', 'total lines'])
+
+        for path in paths:
+            repo_name = os.path.split(path)[1]
+            with (cd.cd(path)):
+
+                gen_loc_data(
+                    conf,
+                    lambda row: writer.writerow([repo_name, row.hash, row.stamp, row.file_count, row.lines_inserted, row.lines_deleted, row.total_lines]))

gitstats/data_generators/gen_revision_data.py (+75 -0)

@@ -0,0 +1,75 @@
+import csv
+import os
+
+from multiprocessing import Pool
+
+from gitstats import cli, cd
+from gitstats.miscfuncs import getlogrange, getpipeoutput, getnumoffilesfromrev
+from gitstats.data import Revision
+
+
+def gen_revision_data(conf, row_processor):
+    '''
+    Given a configuration, pull revision information. For
+    each revision, callback to the row_processor passing a Revision
+
+    :param conf: configuration (mostly used for date limits)
+    :param row_processor: function to receive the callback
+    :return: Number of commits
+    '''
+
+    revisions = {} # tree_hash -> Revision
+    # Collect revision statistics
+    # Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
+
+    # DBG: git rev-list --pretty=format:"%at %ai %aN <%aE>" --since="2017-10-01" "HEAD"', 'grep -v ^commit'
+    lines = getpipeoutput(
+        ['git rev-list --pretty=format:"%%T %%H %%at %%ai %%aN <%%aE>" %s' % getlogrange(conf, 'HEAD'),
+         'grep -v ^commit']).split('\n')
+    for line in lines:
+        parts = line.split(' ', 6)
+        tree_hash = parts[0]
+        sha = parts[1]
+        try:
+            stamp = int(parts[2])
+        except ValueError:
+            stamp = 0
+        timezone = parts[5]
+        author, mail = parts[6].split('<', 1)
+        author = author.strip()
+        mail = mail.rstrip('>')
+        domain = '?'
+        if mail.find('@') != -1:
+            domain = mail.rsplit('@', 1)[1]
+            domain = domain.rstrip('>')
+        revisions[tree_hash] = Revision(sha, stamp, timezone, author, mail, domain)
+
+    # todo: consider putting in a cache for this. There was one in the original code
+    # DBG: git ls-tree -r --name-only "ceb3165b51ae0680724fd71e16a5ff836a0de41e"', 'wc -l'
+    pool = Pool(processes=conf['processes'])
+    rev_count = pool.map(getnumoffilesfromrev, revisions.keys())
+    pool.terminate()
+    pool.join()
+    # Record the file count for each revision
+    for (rev, count) in rev_count:
+        revision = revisions[rev]
+        revision.file_count = count
+        row_processor(revision)
+
+    return len(lines)
+
+
+if __name__ == "__main__":
+    conf, paths, outputpath = cli.get_cli()
+    with open(outputpath, 'w', encoding='utf8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['repo', 'sha', 'stamp', 'timezone', 'author', 'email', 'domain', 'files_changed'])
+
+        for path in paths:
+            repo_name = os.path.split(path)[1]
+            with (cd.cd(path)):
+
+                gen_revision_data(
+                    conf,
+                    lambda row: writer.writerow([repo_name, row.sha, row.stamp, row.timezone, row.author, row.email,
+                                                 row.domain, row.file_count]))

gitstats/data_generators/gen_tag_data.py (+74 -0)

@@ -0,0 +1,74 @@
+import csv
+import os
+
+from collections import defaultdict
+from typing import Dict
+
+from gitstats import cli, cd
+from gitstats.miscfuncs import getpipeoutput
+from gitstats.data import AuthorTotals, Tag
+from gitstats.data_generators import gen_author_totals_data
+
+
+def gen_tag_data(conf, row_processor):
+    '''
+    Given a configuration, pull tag information. For
+    each tag, callback to the row_processor passing a Tag
+
+    :param conf: configuration (mostly used for date limits)
+    :param row_processor: function to receive the callback
+    :return: None
+    '''
+
+    # tags
+    tags = {} # stamp -> tags
+    lines = getpipeoutput(['git show-ref --tags']).split('\n')
+    for line in lines:
+        if len(line) == 0:
+            continue
+        (line_hash, tag) = line.split(' ')
+
+        tag = tag.replace('refs/tags/', '')
+        output = getpipeoutput(['git log "%s" --pretty=format:"%%at" -n 1' % line_hash])
+
+        stamp = 0
+        if len(output) > 0:
+            try:
+                stamp = int(output.strip())
+            except ValueError:
+                stamp = 0
+
+        tags[stamp] = Tag(tag, stamp, line_hash, 0, {})
+
+    stamps = sorted(tags.keys())
+    prev = None
+    for stamp in stamps:
+        def process_row(row: AuthorTotals):
+            tags[stamp].authors[row.author] = row.total_commits
+            tags[stamp].commits += row.total_commits
+
+        revision_tags = tags[stamp].tag
+        if prev is not None:
+            revision_tags += ' "^%s"' % prev
+
+        gen_author_totals_data(conf, process_row, revision_tags)
+        row_processor(tags[stamp])
+
+        prev = tags[stamp].tag
+
+
+if __name__ == "__main__":
+    conf, paths, outputpath = cli.get_cli()
+    with open(outputpath, 'w', encoding='utf8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['repo', 'sha', 'stamp', 'commits for tag', 'author', 'commits by author'])
+
+        for path in paths:
+            repo_name = os.path.split(path)[1]
+            with (cd.cd(path)):
+
+                def process_row(row):
+                    for author, commits in row.authors.items():
+                        writer.writerow([repo_name, row.hash, row.stamp, row.commits, author, commits])
+
+                gen_tag_data(conf, process_row)
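
For clarity, the range handed to gen_author_totals_data for each tag excludes commits already reachable from the previous tag. A tiny sketch with made-up tag names mirrors how the string is built above:

prev = 'v1.0'                 # previous (older) tag, hypothetical
current_tag = 'v2.0'          # tag being processed, hypothetical

revision_range = current_tag
if prev is not None:
    revision_range += ' "^%s"' % prev

print(revision_range)         # v2.0 "^v1.0" -> commits reachable from v2.0 but not v1.0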

gitstats/datacollector.py (+2 -1)

@@ -5,6 +5,7 @@ import pickle
 import time
 import zlib
 
+from collections import defaultdict
 from typing import Dict
 from gitstats.data.author import Author
 
@@ -26,7 +27,7 @@ class DataCollector:
         self.activity_by_year_week = {}  # yy_wNN -> commits
         self.activity_by_year_week_peak = 0
 
-        self.authors: Dict[Author] = {}  # name -> Author
+        self.authors: Dict[Author] = defaultdict(lambda: Author())
 
         self.total_commits = 0
         self.total_files = 0
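
The switch to defaultdict(lambda: Author()) means a missing author is created on first access, which is what lets the explicit "if author not in self.authors" guards be dropped in gitdatacollector.py below. A minimal sketch (this Author stand-in only has the one field the example needs; the real class lives in gitstats/data/author.py):

from collections import defaultdict

class Author:                         # stand-in for gitstats.data.author.Author
    def __init__(self):
        self.commits = 0

authors = defaultdict(lambda: Author())
authors['alice'].commits += 1         # missing keys are created on first access
print(authors['alice'].commits)       # 1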

gitstats/gitdatacollector.py (+69 -260)

@@ -1,14 +1,12 @@
 import datetime
-import logging
-import re
-import os
 
-from multiprocessing import Pool
+from collections import defaultdict
 
-from gitstats.data.author import Author
 from gitstats.datacollector import DataCollector
-from gitstats.miscfuncs import getcommitrange, getlogrange, getnumoffilesfromrev, getnumoflinesinblob, \
-    getpipeoutput, getstatsummarycounts
+from gitstats.data import Author, AuthorRow, File, LocByDate, Revision, Tag
+from gitstats.data_generators import gen_author_data, gen_author_totals_data, gen_tag_data, gen_revision_data, \
+    gen_file_data, gen_loc_data
+from gitstats.miscfuncs import getpipeoutput
 
 
 class GitDataCollector(DataCollector):
@@ -18,201 +16,77 @@ class GitDataCollector(DataCollector):
     def collect(self, directory):
         super(GitDataCollector, self).collect(directory)
 
-        self.total_authors += int(getpipeoutput(['git shortlog -s %s' % getlogrange(self.conf), 'wc -l']))
-        # self.total_lines = int(getoutput('git-ls-files -z |xargs -0 cat |wc -l'))
+        self.total_authors += self.get_total_authors()
         self.get_tags()
         self.get_revision_info()
         self.get_file_info()
         self.get_loc_info()
         self.get_author_info()
 
-    def xlate(self, name):
-        if name in self.conf['name_xlate']:
-            return self.conf['name_xlate'][name]
-        return name
+    def get_total_authors(self):
+        return gen_author_totals_data(self.conf)
 
     def get_author_info(self):
         # Per-author statistics
         # defined for stamp, author only if author commited at this timestamp.
-        self.changes_by_date_by_author = {}  # stamp -> author -> lines_added
-        # Similar to the above, but never use --first-parent
-        # (we need to walk through every commit to know who
-        # committed what, not just through mainline)
-        lines = getpipeoutput(
-            ['git log --shortstat --date-order --pretty=format:"%%at %%aN" %s' % (
-                getlogrange(self.conf, 'HEAD'))]).split('\n')
-        lines.reverse()
-        inserted = 0
-        deleted = 0
-        stamp = 0
-        for line in lines:
-            if len(line) == 0:
-                continue
-
-            # <stamp> <author>
-            if re.search('files? changed', line) is None:
-                pos = line.find(' ')
-                if pos != -1:
-                    try:
-                        oldstamp = stamp
-                        (stamp, author) = (int(line[:pos]), line[pos + 1:])
-                        author = self.xlate(author)
-                        if oldstamp > stamp:
-                            # clock skew, keep old timestamp to avoid having ugly graph
-                            stamp = oldstamp
-                        if author not in self.authors:
-                            self.authors[author] = Author()
-                        self.authors[author].commits += 1
-                        self.authors[author].lines_added += inserted
-                        self.authors[author].lines_removed += deleted
-                        if stamp not in self.changes_by_date_by_author:
-                            self.changes_by_date_by_author[stamp] = {}
-                        if author not in self.changes_by_date_by_author[stamp]:
-                            self.changes_by_date_by_author[stamp][author] = Author()
-                        self.changes_by_date_by_author[stamp][author].lines_added = self.authors[author].lines_added
-                        self.changes_by_date_by_author[stamp][author].commits = self.authors[author].commits
-                        files, inserted, deleted = 0, 0, 0
-                    except ValueError:
-                        logging.warning(f'unexpected line "{line}')
-                else:
-                    logging.warning(f'unexpected line "{line}')
-            else:
-                numbers = getstatsummarycounts(line)
 
-                if len(numbers) == 3:
-                    (files, inserted, deleted) = map(lambda el: int(el), numbers)
-                else:
-                    logging.warning(f'Failed to handle line "{line}"')
-                    (files, inserted, deleted) = (0, 0, 0)
+        self.changes_by_date_by_author = defaultdict(lambda: defaultdict(lambda: Author())) # stamp -> author -> lines_added
+
+        def row_processor(row: AuthorRow):
+            self.authors[row.author].commits += 1
+            self.authors[row.author].lines_added += row.lines_inserted
+            self.authors[row.author].lines_removed += row.lines_deleted
+            self.changes_by_date_by_author[row.stamp][row.author].lines_added = self.authors[row.author].lines_added
+            self.changes_by_date_by_author[row.stamp][row.author].commits = self.authors[row.author].commits
+
+        gen_author_data(self.conf, row_processor)
 
     def get_loc_info(self):
-        # line statistics
-        # outputs:
-        #  N files changed, N insertions (+), N deletions(-)
-        # <stamp> <author>
+
         self.changes_by_date = {}  # stamp -> { files, ins, del }
-        # computation of lines of code by date is better done
-        # on a linear history.
-        extra = ''
-        if self.conf['linear_linestats']:
-            extra = '--first-parent -m'
-        lines = getpipeoutput(
-            ['git log --shortstat %s --pretty=format:"%%at %%aN" %s' % (extra, getlogrange(self.conf, 'HEAD'))]).split(
-            '\n')
-        lines.reverse()
-        files = 0
-        inserted = 0
-        deleted = 0
-        total_lines = 0
-        for line in lines:
-            if len(line) == 0:
-                continue
-
-            # <stamp> <author>
-            if re.search('files? changed', line) is None:
-                pos = line.find(' ')
-                if pos != -1:
-                    try:
-                        (stamp, author) = (int(line[:pos]), line[pos + 1:])
-                        self.changes_by_date[stamp] = {'files': files, 'ins': inserted, 'del': deleted,
-                                                       'lines': total_lines}
-
-                        date = datetime.datetime.fromtimestamp(stamp)
-                        yymm = date.strftime('%Y-%m')
-                        self.lines_added_by_month[yymm] = self.lines_added_by_month.get(yymm, 0) + inserted
-                        self.lines_removed_by_month[yymm] = self.lines_removed_by_month.get(yymm, 0) + deleted
-
-                        yy = date.year
-                        self.lines_added_by_year[yy] = self.lines_added_by_year.get(yy, 0) + inserted
-                        self.lines_removed_by_year[yy] = self.lines_removed_by_year.get(yy, 0) + deleted
-
-                        files, inserted, deleted = 0, 0, 0
-                    except ValueError:
-                        logging.warning(f'unexpected line "{line}')
-                else:
-                    logging.warning(f'unexpected line "{line}')
-            else:
-                numbers = getstatsummarycounts(line)
+        def row_processor(row: LocByDate):
+            self.changes_by_date[row.stamp] = {
+                'files': row.file_count,
+                'ins': row.lines_inserted,
+                'del': row.lines_deleted,
+                'lines': row.total_lines
+            }
+            date = datetime.datetime.fromtimestamp(row.stamp)
+            yymm = date.strftime('%Y-%m')
+            self.lines_added_by_month[yymm] = self.lines_added_by_month.get(yymm, 0) + row.lines_inserted
+            self.lines_removed_by_month[yymm] = self.lines_removed_by_month.get(yymm, 0) + row.lines_deleted
+
+            yy = date.year
+            self.lines_added_by_year[yy] = self.lines_added_by_year.get(yy, 0) + row.lines_inserted
+            self.lines_removed_by_year[yy] = self.lines_removed_by_year.get(yy, 0) + row.lines_deleted
 
-                if len(numbers) == 3:
-                    (files, inserted, deleted) = map(lambda el: int(el), numbers)
-                    total_lines += inserted
-                    total_lines -= deleted
-                    self.total_lines_added += inserted
-                    self.total_lines_removed += deleted
+            self.total_lines_added += row.lines_inserted
+            self.total_lines_removed += row.lines_deleted
 
-                else:
-                    logging.warning(f'Failed to handle line "{line}"')
-                    (files, inserted, deleted) = (0, 0, 0)
-            # self.changes_by_date[stamp] = { 'files': files, 'ins': inserted, 'del': deleted }
-        self.total_lines += total_lines
+        self.total_lines += gen_loc_data(self.conf, row_processor)
 
     def get_file_info(self):
         # extensions and size of files
-        lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange(self.conf, 'HEAD', end_only=True)]).split(
-            '\000')
-        blobs_to_read = []
-        for line in lines:
-            if len(line) == 0:
-                continue
-            parts = re.split('\s+', line, 4)
-            if parts[0] == '160000' and parts[3] == '-':
-                # skip submodules
-                continue
-            blob_id = parts[2]
-            size = int(parts[3])
-            fullpath = parts[4]
-
-            self.total_size += size
+        def row_processor(row: File):
+            self.total_size += row.size
             self.total_files += 1
+            if row.ext not in self.extensions:
+                self.extensions[row.ext] = {'files': 0, 'lines': 0}
+            self.extensions[row.ext]['files'] += 1
+            self.extensions[row.ext]['lines'] += row.lines
 
-            _, ext = os.path.splitext(fullpath)
-            if len(ext) > self.conf['max_ext_length']:
-                ext = ''
-            if ext not in self.extensions:
-                self.extensions[ext] = {'files': 0, 'lines': 0}
-            self.extensions[ext]['files'] += 1
-            # if cache empty then add ext and blob id to list of new blob's
-            # otherwise try to read needed info from cache
-            if 'lines_in_blob' not in self.cache.keys():
-                blobs_to_read.append((ext, blob_id))
-                continue
-            if blob_id in self.cache['lines_in_blob'].keys():
-                self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
-            else:
-                blobs_to_read.append((ext, blob_id))
-        # Get info abount line count for new blob's that wasn't found in cache
-        pool = Pool(processes=self.conf['processes'])
-        ext_blob_linecount = pool.map(getnumoflinesinblob, blobs_to_read)
-        pool.terminate()
-        pool.join()
-        # Update cache and write down info about number of number of lines
-        for (ext, blob_id, linecount) in ext_blob_linecount:
-            if 'lines_in_blob' not in self.cache:
-                self.cache['lines_in_blob'] = {}
-            self.cache['lines_in_blob'][blob_id] = linecount
-            self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
+        gen_file_data(self.conf, row_processor)
 
     def get_revision_info(self):
         # Collect revision statistics
         # Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
-        lines = getpipeoutput(
-            ['git rev-list --pretty=format:"%%at %%ai %%aN <%%aE>" %s' % getlogrange(self.conf, 'HEAD'),
-             'grep -v ^commit']).split(
-            '\n')
-        for line in lines:
-            parts = line.split(' ', 4)
-            try:
-                stamp = int(parts[0])
-            except ValueError:
-                stamp = 0
-            timezone = parts[3]
-            author, mail = parts[4].split('<', 1)
-            author = self.xlate(author.rstrip())
-            mail = mail.rstrip('>')
-            domain = '?'
-            if mail.find('@') != -1:
-                domain = mail.rsplit('@', 1)[1]
+
+        def row_processor(row: Revision):
+            stamp = row.stamp
+            domain = row.domain
+            author = row.author
+            timezone = row.timezone
+
             date = datetime.datetime.fromtimestamp(float(stamp))
 
             # First and last commit stamp (may be in any order because of cherry-picking and patches)
@@ -258,8 +132,6 @@ class GitDataCollector(DataCollector):
                 self.activity_by_year_week_peak = self.activity_by_year_week[yyw]
 
             # author stats
-            if author not in self.authors:
-                self.authors[author] = Author()
            self.authors[author].activity_by_day_and_hour[day][hour] += 1
             # commits, note again that commits may be in any date order because of cherry-picking and patches
             if not self.authors[author].last_commit_stamp:
@@ -303,99 +175,36 @@ class GitDataCollector(DataCollector):
 
             # timezone
             self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
-        # outputs "<stamp> <files>" for each revision
-        revlines = getpipeoutput(
-            ['git rev-list --pretty=format:"%%at %%T" %s' % getlogrange(self.conf, 'HEAD'),
-             'grep -v ^commit']).strip().split('\n')
-        lines = []
-        revs_to_read = []
-        # Look up rev in cache and take info from cache if found
-        # If not append rev to list of rev to read from repo
-        for revline in revlines:
-            time, rev = revline.split(' ')
-            # if cache empty then add time and rev to list of new rev's
-            # otherwise try to read needed info from cache
-            if 'files_in_tree' not in self.cache.keys():
-                revs_to_read.append((time, rev))
-                continue
-            if rev in self.cache['files_in_tree'].keys():
-                lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
-            else:
-                revs_to_read.append((time, rev))
-        # Read revisions from repo
-        pool = Pool(processes=self.conf['processes'])
-        time_rev_count = pool.map(getnumoffilesfromrev, revs_to_read)
-        pool.terminate()
-        pool.join()
-        # Update cache with new revisions and append then to general list
-        for (time, rev, count) in time_rev_count:
-            if 'files_in_tree' not in self.cache:
-                self.cache['files_in_tree'] = {}
-            self.cache['files_in_tree'][rev] = count
-            lines.append('%d %d' % (int(time), count))
-        self.total_commits += len(lines)
-        for line in lines:
-            parts = line.split(' ')
-            if len(parts) != 2:
-                continue
-            (stamp, files) = parts[0:2]
-            try:
-                self.files_by_stamp[int(stamp)] = int(files)
-            except ValueError:
-                logging.warning(f'Failed to parse line "{line}"')
+
+            # file counts
+            self.files_by_stamp[stamp] = row.file_count
+
+        self.total_commits += gen_revision_data(self.conf, row_processor)
 
     def get_tags(self):
-        # tags
-        lines = getpipeoutput(['git show-ref --tags']).split('\n')
-        for line in lines:
-            if len(line) == 0:
-                continue
-            (line_hash, tag) = line.split(' ')
-
-            tag = tag.replace('refs/tags/', '')
-            output = getpipeoutput(['git log "%s" --pretty=format:"%%at %%aN" -n 1' % line_hash])
-            if len(output) > 0:
-                parts = output.split(' ')
-                try:
-                    stamp = int(parts[0])
-                except ValueError:
-                    stamp = 0
-                self.tags[tag] = {'stamp': stamp,
-                                  'hash': line_hash,
-                                  'date': datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d'),
-                                  'commits': 0,
-                                  'authors': {}}
-        # collect info on tags, starting from latest
-        tags_sorted_by_date_asc = [tup[1] for tup in sorted([(el[1]['date'], el[0]) for el in self.tags.items()])]
-        # tags_sorted_by_date_desc = map(lambda el: el[1],
-        #                                reversed(sorted(map(lambda el: (el[1]['date'], el[0]), self.tags.items()))))
-        prev = None
-        #        for tag in reversed(tags_sorted_by_date_desc):
-        for tag in tags_sorted_by_date_asc:
-            cmd = 'git shortlog -s "%s"' % tag
-            if prev is not None:
-                cmd += ' "^%s"' % prev
-            output = getpipeoutput([cmd])
-            if len(output) == 0:
-                continue
-            prev = tag
-            for line in output.split('\n'):
-                parts = re.split('\s+', line, 2)
-                commits = int(parts[1])
-                author = parts[2]
-                self.tags[tag]['commits'] += commits
-                self.tags[tag]['authors'][author] = commits
+        def row_processor(row: Tag):
+            self.tags[row.tag] = {
+                'stamp': row.stamp,
+                'hash': row.hash,
+                'date': datetime.datetime.fromtimestamp(row.stamp).strftime('%Y-%m-%d'),
+                'commits': row.commits,
+                'authors': row.authors
+            }
+
+        gen_tag_data(self.conf, row_processor)
 
     def refine(self):
         # authors
         # name -> {place_by_commits, commits_frac, date_first, date_last, timedelta}
         self.authors_by_commits = self.getAuthors()
+        total_commits_without_merge = 0
         for i, name in enumerate(self.authors_by_commits):
             self.authors[name].place_by_commits = i + 1
+            total_commits_without_merge += self.authors[name].commits
 
         for name in self.authors.keys():
             a = self.authors[name]
-            a.commits_frac = (100 * float(a.commits)) / self.getTotalCommits()
+            a.commits_frac = (100 * float(a.commits)) / total_commits_without_merge
             date_first = datetime.datetime.fromtimestamp(a.first_commit_stamp)
             date_last = datetime.datetime.fromtimestamp(a.last_commit_stamp)
             delta = date_last - date_first

gitstats/gitstats.py (+0 -138)

@@ -1,138 +0,0 @@
-#!/usr/bin/python
-# Copyright (c) 2007-2014 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/AUTHOR)
-# GPLv2 / GPLv3
-import getopt
-import logging
-import os
-import sys
-import time
-
-import multiprocessing_logging
-
-from collections import defaultdict
-
-from .gitdatacollector import GitDataCollector
-from .htmlreportcreator import HTMLReportCreator
-from .miscfuncs import getgnuplotversion
-
-exectime_internal = 0.0
-exectime_external = 0.0
-
-conf = {
-    'max_domains': 10,
-    'max_ext_length': 10,
-    'style': 'gitstats.css',
-    'max_authors': 20,
-    'authors_top': 5,
-    'commit_begin': '',
-    'commit_end': 'HEAD',
-    'linear_linestats': 1,
-    'project_name': '',
-    'processes': 8,
-    'start_date': '',
-    'logging': logging.INFO,
-    'name_xlate': defaultdict(dict)
-}
-
-class GitStats:
-    def _usage(self):
-        print(f"""
-    Usage: gitstats [options] <gitpath..> <outputpath>
-
-    Options:
-    -c key=value     Override configuration value
-    -n key=value     Define author name equivalency (key will treated the same as value)
-
-    Default config values:
-    {conf}
-
-    Please see the manual page for more details.
-    """)
-
-    def run(self):
-        optlist, args = getopt.getopt(sys.argv[1:], 'hc:n:', ["help"])
-        for o, v in optlist:
-            if o == '-c':
-                key, value = v.split('=', 1)
-                if key not in conf:
-                    raise KeyError('no such key "%s" in config' % key)
-                if isinstance(conf[key], int):
-                    conf[key] = int(value)
-                else:
-                    conf[key] = value
-            elif o in ('-h', '--help'):
-                self._usage()
-                sys.exit()
-            elif o == '-n':
-                key, value = v.split('=', 1)
-                conf['name_xlate'][key] = value
-
-
-        if len(args) < 2:
-            self._usage()
-            sys.exit(0)
-
-        outputpath = os.path.abspath(args[-1])
-        paths = args[0:-1]
-        outputpath = os.path.abspath(outputpath)
-
-        logging.basicConfig(level=conf['logging'], format='%(message)s')
-        multiprocessing_logging.install_mp_handler()
-        time_start = time.time()
-
-
-        rundir = os.getcwd()
-
-        try:
-            os.makedirs(outputpath)
-        except OSError:
-            pass
-        if not os.path.isdir(outputpath):
-            logging.fatal('Output path is not a directory or does not exist')
-            sys.exit(1)
-
-        if not getgnuplotversion():
-            logging.error('gnuplot not found')
-            sys.exit(1)
-
-        logging.info(f'Output path: {outputpath}')
-        cachefile = os.path.join(outputpath, 'gitstats.cache')
-
-        data = GitDataCollector(conf)
-        data.loadCache(cachefile)
-
-        for gitpath in paths:
-            logging.info(f'Git path: {gitpath}')
-
-            prevdir = os.getcwd()
-            os.chdir(gitpath)
-
-            logging.info('Collecting data...')
-            data.collect(gitpath)
-
-            os.chdir(prevdir)
-
-        data.saveCache(cachefile)
-
-        logging.info('Refining data...')
-        data.refine()
-
-        os.chdir(rundir)
-
-        logging.info('Generating report...')
-        report = HTMLReportCreator(conf)
-        report.create(data, outputpath)
-
-        time_end = time.time()
-        calculated_exectime_internal = time_end - time_start
-        logging.info(f'Execution time {calculated_exectime_internal} secs, {exectime_external} secs ({(100.0 * exectime_external) / calculated_exectime_internal}%) in external commands)')
-
-        print('You may now run:')
-        print()
-        print('   sensible-browser \'%s\'' % os.path.join(outputpath, 'index.html').replace("'", "'\\''"))
-        print()
-
-
-if __name__ == '__main__':
-    g = GitStats()
-    g.run()

gitstats/miscfuncs.py (+7 -6)

@@ -80,19 +80,20 @@ def getgnuplotversion():
     return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
 
 
-def getnumoffilesfromrev(time_rev):
+def getnumoffilesfromrev(tree_hash):
     """
     Get number of files changed in commit
     """
-    time_portion, rev = time_rev
-    return (int(time_portion), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
+    # DBG: git ls-tree -r --name-only "ceb3165b51ae0680724fd71e16a5ff836a0de41e"' | 'wc -l'
+    return (tree_hash, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % tree_hash, 'wc -l']).split('\n')[0]))
 
 
-def getnumoflinesinblob(ext_blob):
+def getnumoflinesinblob(blob_id):
     """
     Get number of lines in blob
     """
-    ext, blob_id = ext_blob
-    return ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0])
+
+    # DBG: git cat-file blob e4f17a621893811250be96c8ef9c37b5e97a1df7', 'wc -l'
+    return blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0])
 
 

gitstats/process.py (+81 -0)

@@ -0,0 +1,81 @@
+#!/usr/bin/python
+# Copyright (c) 2007-2014 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/AUTHOR)
+# GPLv2 / GPLv3
+import logging
+import os
+import sys
+import time
+
+import multiprocessing_logging
+
+from gitstats.gitdatacollector import GitDataCollector
+from gitstats.htmlreportcreator import HTMLReportCreator
+from gitstats.miscfuncs import getgnuplotversion
+from gitstats import cli
+
+exectime_internal = 0.0
+exectime_external = 0.0
+
+def run():
+
+    conf, paths, outputpath = cli.get_cli()
+
+    logging.basicConfig(level=conf['logging'], format='%(message)s')
+    multiprocessing_logging.install_mp_handler()
+    time_start = time.time()
+
+
+    rundir = os.getcwd()
+
+    try:
+        os.makedirs(outputpath)
+    except OSError:
+        pass
+    if not os.path.isdir(outputpath):
+        logging.fatal('Output path is not a directory or does not exist')
+        sys.exit(1)
+
+    if not getgnuplotversion():
+        logging.error('gnuplot not found')
+        sys.exit(1)
+
+    logging.info(f'Output path: {outputpath}')
+    cachefile = os.path.join(outputpath, 'gitstats.cache')
+
+    data = GitDataCollector(conf)
+    data.loadCache(cachefile)
+
+    for gitpath in paths:
+        logging.info(f'Git path: {gitpath}')
+
+        prevdir = os.getcwd()
+        os.chdir(gitpath)
+
+        logging.info('Collecting data...')
+        data.collect(gitpath)
+
+        os.chdir(prevdir)
+
+    data.saveCache(cachefile)
+
+    logging.info('Refining data...')
+    data.refine()
+
+    os.chdir(rundir)
+
+    logging.info('Generating report...')
+    report = HTMLReportCreator(conf)
+    report.create(data, outputpath)
+
+    time_end = time.time()
+    calculated_exectime_internal = time_end - time_start
+    logging.info(f'Execution time {calculated_exectime_internal} secs, {exectime_external} secs ({(100.0 * exectime_external) / calculated_exectime_internal}%) in external commands)')
+
+    print('You may now run:')
+    print()
+    print('   sensible-browser \'%s\'' % os.path.join(outputpath, 'index.html').replace("'", "'\\''"))
+    print()
+
+
+if __name__ == '__main__':
+    run()