Quellcode durchsuchen

Refactor data generators into separate classes

Use callback pattern to handle processing of individual rows of data from data generators
Use dataclasses to define individual data instances
Dan Rapp vor 7 Jahren
Ursprung
Commit
2047313000

+ 2
- 6
gitstats/__init__.py Datei anzeigen

@@ -1,11 +1,7 @@
1
-from gitstats.gitstats import GitStats
1
+from gitstats.process import run
2 2
 from gitstats._version import get_versions
3 3
 __version__ = get_versions()['version']
4 4
 del get_versions
5 5
 
6
-def main():
7
-    g = GitStats()
8
-    g.run()
9
-
10 6
 if __name__ == "__main__":
11
-    main()
7
+    run()

+ 16
- 0
gitstats/cd.py Datei anzeigen

@@ -0,0 +1,16 @@
1
+"""Holds the current working directory context class. A python version of pushd/popd"""
2
+import os
3
+
4
+# pylint: disable=too-few-public-methods
5
+class cd: # pylint: disable=invalid-name
6
+    """Context manager for changing the current working directory"""
7
+    def __init__(self, newPath):
8
+        self.new_path = os.path.expanduser(newPath)
9
+        self.saved_path = os.getcwd()
10
+
11
+    def __enter__(self):
12
+        self.saved_path = os.getcwd()
13
+        os.chdir(self.new_path)
14
+
15
+    def __exit__(self, etype, value, traceback):
16
+        os.chdir(self.saved_path)

+ 60
- 0
gitstats/cli.py Datei anzeigen

@@ -0,0 +1,60 @@
1
+import getopt
2
+import logging
3
+import os
4
+import sys
5
+
6
+conf = {
7
+    'max_domains': 10,
8
+    'max_ext_length': 10,
9
+    'style': 'gitstats.css',
10
+    'max_authors': 20,
11
+    'authors_top': 5,
12
+    'commit_begin': '',
13
+    'commit_end': 'HEAD',
14
+    'linear_linestats': 1,
15
+    'project_name': '',
16
+    'processes': 8,
17
+    'start_date': '',
18
+    'logging': logging.INFO,
19
+}
20
+
21
+
22
+def _usage():
23
+    print(f"""
24
+Usage: gitstats [options] <gitpath..> <outputpath>
25
+
26
+Options:
27
+-c key=value     Override configuration value
28
+-n key=value     Define author name equivalency (key will treated the same as value)
29
+
30
+Default config values:
31
+{conf}
32
+
33
+Please see the manual page for more details.
34
+""")
35
+
36
+
37
+def get_cli():
38
+    optlist, args = getopt.getopt(sys.argv[1:], 'hc:', ["help"])
39
+    for o, v in optlist:
40
+        if o == '-c':
41
+            key, value = v.split('=', 1)
42
+            if key not in conf:
43
+                raise KeyError('no such key "%s" in config' % key)
44
+            if isinstance(conf[key], int):
45
+                conf[key] = int(value)
46
+            else:
47
+                conf[key] = value
48
+        elif o in ('-h', '--help'):
49
+            _usage()
50
+            sys.exit()
51
+
52
+    if len(args) < 2:
53
+        _usage()
54
+        sys.exit(0)
55
+
56
+    outputpath = os.path.abspath(args[-1])
57
+    paths = args[0:-1]
58
+    outputpath = os.path.abspath(outputpath)
59
+
60
+    return conf, paths, outputpath

+ 7
- 0
gitstats/data/__init__.py Datei anzeigen

@@ -0,0 +1,7 @@
1
+from .author import Author
2
+from .author_row import AuthorRow
3
+from .author_totals import AuthorTotals
4
+from .tag import Tag
5
+from .revision import Revision
6
+from .file import File
7
+from .loc_by_date import LocByDate

+ 10
- 0
gitstats/data/author_row.py Datei anzeigen

@@ -0,0 +1,10 @@
1
+from dataclasses import dataclass
2
+
3
+@dataclass
4
+class AuthorRow:
5
+    sha: str
6
+    stamp: int
7
+    author: str
8
+    files_modified: int
9
+    lines_inserted: int
10
+    lines_deleted: int

+ 6
- 0
gitstats/data/author_totals.py Datei anzeigen

@@ -0,0 +1,6 @@
1
+from dataclasses import dataclass
2
+
3
+@dataclass
4
+class AuthorTotals:
5
+    author: str
6
+    total_commits: int

+ 10
- 0
gitstats/data/file.py Datei anzeigen

@@ -0,0 +1,10 @@
1
+from dataclasses import dataclass
2
+
3
+@dataclass
4
+class File:
5
+    full_path: str
6
+    ext: str
7
+    size: int
8
+    lines: int = 0
9
+
10
+

+ 13
- 0
gitstats/data/loc_by_date.py Datei anzeigen

@@ -0,0 +1,13 @@
1
+from dataclasses import dataclass
2
+
3
+
4
+@dataclass
5
+class LocByDate:
6
+    hash: str = ''
7
+    stamp: int = 0
8
+    file_count: int = 0
9
+    lines_inserted: int = 0
10
+    lines_deleted: int = 0
11
+    total_lines: int = 0
12
+
13
+

+ 16
- 0
gitstats/data/revision.py Datei anzeigen

@@ -0,0 +1,16 @@
1
+from collections import defaultdict
2
+from dataclasses import dataclass, field
3
+from typing import Dict
4
+
5
+#    # Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
6
+
7
+@dataclass
8
+class Revision:
9
+    sha: str
10
+    stamp: int
11
+    timezone: int = 0
12
+    author: str = ''
13
+    email: str = ''
14
+    domain: str = ''
15
+    file_count: int = 0
16
+

+ 13
- 0
gitstats/data/tag.py Datei anzeigen

@@ -0,0 +1,13 @@
1
+from collections import defaultdict
2
+from dataclasses import dataclass, field
3
+from typing import Dict
4
+
5
+@dataclass
6
+class Tag:
7
+    tag: str
8
+    stamp: int
9
+    hash: str
10
+    commits: int = 0
11
+    authors: Dict[str, int] = field(default_factory=defaultdict(int))
12
+
13
+

+ 6
- 0
gitstats/data_generators/__init__.py Datei anzeigen

@@ -0,0 +1,6 @@
1
+from .gen_author_data import gen_author_data
2
+from .gen_author_totals import gen_author_totals_data
3
+from .gen_tag_data import gen_tag_data
4
+from .gen_revision_data import gen_revision_data
5
+from .gen_file_data import gen_file_data
6
+from .gen_loc_data import gen_loc_data

+ 88
- 0
gitstats/data_generators/gen_author_data.py Datei anzeigen

@@ -0,0 +1,88 @@
1
+import csv
2
+import logging
3
+import os
4
+import re
5
+
6
+from gitstats import cli, cd
7
+from gitstats.miscfuncs import getlogrange, getpipeoutput, getstatsummarycounts
8
+from gitstats.data import AuthorRow
9
+
10
+
11
+def gen_author_data(conf, row_processor):
12
+    '''
13
+    Given a configuration, pull authorship information. For
14
+    each author, callback to the row_processor passing an AuthorRow
15
+
16
+    :param conf: configuration (mostly used for date limits)
17
+    :param row_processor: function to receive the callback
18
+    :return: None
19
+    '''
20
+
21
+    # DBG: git log --shortstat --date-order --pretty=format:"%H %at %aN" --since="2017-10-01" "HEAD"
22
+    # Results are in the form of
23
+    #
24
+    # 3c16756701d264619db0b309f42ebdc713b29827 1522513256 Dan Rapp
25
+    # 524ee0d32ffbbb8bb82966b769bbf7dbc1d87a68 1522480979 Michael Wright
26
+    # 1 file changed, 6 insertions(+)
27
+    #
28
+    # If there are two (or more) lines,
29
+    # The first line(s) is the merge to master or other branch
30
+    # The last line is the commit on the branch
31
+    lines = getpipeoutput(
32
+        ['git log --shortstat --date-order --pretty=format:"%%H %%at %%aN" %s' % (
33
+            getlogrange(conf, 'HEAD'))]).split('\n')
34
+    lines.reverse()
35
+
36
+    files = 0
37
+    inserted = 0
38
+    deleted = 0
39
+    stamp = 0
40
+    for line in lines:
41
+        if len(line) == 0:
42
+            continue
43
+
44
+        # <stamp> <author>
45
+        if re.search('files? changed', line) is None:
46
+            if files + inserted + deleted > 0:  # this case indicates we've already processed the line
47
+                pos = line.find(' ')
48
+                if pos != -1:
49
+                    try:
50
+                        oldstamp = stamp
51
+                        tokens = line.split()
52
+                        sha = tokens[0]
53
+                        stamp = int(tokens[1])
54
+                        author = ' '.join(tokens[2:])
55
+                        if oldstamp > stamp:
56
+                            # clock skew, keep old timestamp to avoid having ugly graph
57
+                            stamp = oldstamp
58
+                        row_processor(AuthorRow(sha, stamp, author, files, inserted, deleted))
59
+                        # Since subsequent lines are (generally) reflections of merging into a branch
60
+                        # don't provide "credit" to the author who did the merge
61
+                        (files, inserted, deleted) = 0, 0, 0
62
+                    except ValueError:
63
+                        logging.warning(f'unexpected line "{line}')
64
+                else:
65
+                    logging.warning(f'unexpected line "{line}')
66
+        else:
67
+            numbers = getstatsummarycounts(line)
68
+
69
+            if len(numbers) == 3:
70
+                (files, inserted, deleted) = map(lambda el: int(el), numbers)
71
+            else:
72
+                logging.warning(f'Failed to handle line "{line}"')
73
+                (files, inserted, deleted) = (0, 0, 0)
74
+
75
+if __name__ == "__main__":
76
+    conf, paths, outputpath = cli.get_cli()
77
+    with open(outputpath, 'w', encoding='utf8') as f:
78
+        writer = csv.writer(f)
79
+        writer.writerow(['repo', 'sha', 'stamp', 'author', 'files changed', 'lines inserted', 'lines deleted'])
80
+
81
+        for path in paths:
82
+            repo_name = os.path.split(path)[1]
83
+            with (cd.cd(path)):
84
+
85
+                gen_author_data(
86
+                    conf,
87
+                    lambda row: writer.writerow([repo_name, row.sha, row.stamp, row.author, row.files_modified,
88
+                                                 row.lines_inserted, row.lines_deleted]))

+ 47
- 0
gitstats/data_generators/gen_author_totals.py Datei anzeigen

@@ -0,0 +1,47 @@
1
+import csv
2
+import os
3
+
4
+from gitstats import cli, cd
5
+from gitstats.miscfuncs import getlogrange, getpipeoutput
6
+from gitstats.data import AuthorTotals
7
+
8
+
9
+def gen_author_totals_data(conf, row_processor=None, revision_range=None):
10
+    '''
11
+    Given a configuration, pull the total number of commits per author. For
12
+    each "row" callback to the row_processor passing an AuthorTotals
13
+
14
+    :param conf: configuration (mostly used for date limits)
15
+    :param row_processor: function to receive the callback
16
+    :return: count of the number of authors
17
+    '''
18
+
19
+    # DBG: git shortlog -s --since="2017-10-01" "HEAD"
20
+    if not revision_range:
21
+        revision_range = getlogrange(conf)
22
+    lines = getpipeoutput(['git shortlog -s %s' % revision_range]).split('\n')
23
+    count = 0
24
+    for line in lines:
25
+        line = line.strip()
26
+        if not line:
27
+            continue
28
+        count += 1
29
+        if row_processor:
30
+            tokens = line.split()
31
+            commit_count = int(tokens[0])
32
+            author = ' '.join(tokens[1:])
33
+            row_processor(AuthorTotals(author, commit_count))
34
+    return count
35
+
36
+if __name__ == "__main__":
37
+    conf, paths, outputpath = cli.get_cli()
38
+    with open(outputpath, 'w', encoding='utf8') as f:
39
+        writer = csv.writer(f)
40
+        writer.writerow(['repo', 'author', 'commits'])
41
+
42
+        for path in paths:
43
+            repo_name = os.path.split(path)[1]
44
+            with (cd.cd(path)):
45
+                gen_author_totals_data(
46
+                    conf,
47
+                    lambda row: writer.writerow([repo_name, row.author, row.total_commits]))

+ 65
- 0
gitstats/data_generators/gen_file_data.py Datei anzeigen

@@ -0,0 +1,65 @@
1
+import csv
2
+import os
3
+import re
4
+
5
+from multiprocessing import Pool
6
+
7
+from gitstats import cli, cd
8
+from gitstats.miscfuncs import getcommitrange, getpipeoutput, getnumoflinesinblob
9
+from gitstats.data import File
10
+
11
+
12
+def gen_file_data(conf, row_processor):
13
+    '''
14
+    Given a configuration, pull file information (path, extension, size, line count). For
15
+    each distinct file blob, callback to the row_processor passing a File
16
+
17
+    :param conf: configuration (mostly used for date limits)
18
+    :param row_processor: function to receive the callback
19
+    :return: None
20
+    '''
21
+
22
+    # extensions and size of files
23
+
24
+    # DBG: git ls-tree -r -l -z HEAD
25
+    lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange(conf, 'HEAD', end_only=True)]).split(
26
+        '\000')
27
+    blobs_to_read = {} # blob_id -> File
28
+    for line in lines:
29
+        if len(line) == 0:
30
+            continue
31
+        parts = re.split('\s+', line, 4)
32
+        if parts[0] == '160000' and parts[3] == '-':
33
+            # skip submodules
34
+            continue
35
+        blob_id = parts[2]
36
+        size = int(parts[3])
37
+        fullpath = parts[4]
38
+        _, ext = os.path.splitext(fullpath)
39
+        blobs_to_read[blob_id] = File(fullpath, ext, size)
40
+
41
+    # DBG: git cat-file blob e4f17a621893811250be96c8ef9c37b5e97a1df7', 'wc -l'
42
+    pool = Pool(processes=conf['processes'])
43
+    blob_linecount = pool.map(getnumoflinesinblob, blobs_to_read.keys())
44
+    pool.terminate()
45
+    pool.join()
46
+    # Record the number of lines for each blob and pass the completed File to the callback
47
+    for (blob_id, linecount) in blob_linecount:
48
+        file_data = blobs_to_read[blob_id]
49
+        file_data.lines = linecount
50
+        row_processor(file_data)
51
+
52
+
53
+if __name__ == "__main__":
54
+    conf, paths, outputpath = cli.get_cli()
55
+    with open(outputpath, 'w', encoding='utf8') as f:
56
+        writer = csv.writer(f)
57
+        writer.writerow(['repo', 'file', 'ext', 'size', 'line_count'])
58
+
59
+        for path in paths:
60
+            repo_name = os.path.split(path)[1]
61
+            with (cd.cd(path)):
62
+
63
+                gen_file_data(
64
+                    conf,
65
+                    lambda row: writer.writerow([repo_name, row.full_path, row.ext, row.size, row.lines]))

+ 80
- 0
gitstats/data_generators/gen_loc_data.py Datei anzeigen

@@ -0,0 +1,80 @@
1
+import csv
2
+import logging
3
+import os
4
+import re
5
+
6
+from gitstats import cli, cd
7
+from gitstats.miscfuncs import getlogrange, getpipeoutput, getstatsummarycounts
8
+from gitstats.data import LocByDate
9
+
10
+
11
+def gen_loc_data(conf, row_processor):
12
+    '''
13
+    Given a configuration, pull lines-of-code history. For
14
+    each commit, callback to the row_processor passing a LocByDate
15
+
16
+    :param conf: configuration (mostly used for date limits)
17
+    :param row_processor: function to receive the callback
18
+    :return: total lines in repo
19
+    '''
20
+
21
+    # line statistics
22
+    # outputs:
23
+    #  N files changed, N insertions (+), N deletions(-)
24
+
25
+    # computation of lines of code by date is better done
26
+    # on a linear history.
27
+    extra = ''
28
+    if conf['linear_linestats']:
29
+        extra = '--first-parent -m'
30
+
31
+    # DBG: git log --shortstat --first-parent -m --pretty=format:"%H %at %aN" --since="2017-10-01" "HEAD"
32
+    lines = getpipeoutput(
33
+        ['git log --shortstat %s --pretty=format:"%%H %%at %%aN" %s' % (extra, getlogrange(conf, 'HEAD'))]).split('\n')
34
+    lines.reverse()
35
+    files = 0
36
+    inserted = 0
37
+    deleted = 0
38
+    total_lines = 0
39
+    for line in lines:
40
+        if len(line) == 0:
41
+            continue
42
+
43
+        if re.search('files? changed', line) is None:
44
+            line = line.strip()
45
+            if line:
46
+                try:
47
+                    parts = line.split(' ', 2)
48
+                    (hash, stamp, author) = (parts[0], int(parts[1]), parts[2])
49
+                    row_processor(LocByDate(hash, stamp, files, inserted, deleted, total_lines))
50
+                    files, inserted, deleted = 0, 0, 0
51
+                except ValueError:
52
+                    logging.warning(f'unexpected line "{line}')
53
+            else:
54
+                logging.warning(f'unexpected line "{line}')
55
+        else:
56
+            numbers = getstatsummarycounts(line)
57
+
58
+            if len(numbers) == 3:
59
+                (files, inserted, deleted) = map(lambda el: int(el), numbers)
60
+                total_lines += inserted
61
+                total_lines -= deleted
62
+            else:
63
+                logging.warning(f'Failed to handle line "{line}"')
64
+                (files, inserted, deleted) = (0, 0, 0)
65
+    return total_lines
66
+
67
+
68
+if __name__ == "__main__":
69
+    conf, paths, outputpath = cli.get_cli()
70
+    with open(outputpath, 'w', encoding='utf8') as f:
71
+        writer = csv.writer(f)
72
+        writer.writerow(['repo', 'sha', 'stamp', 'file count', 'lines inserted', 'lines deleted', 'total lines'])
73
+
74
+        for path in paths:
75
+            repo_name = os.path.split(path)[1]
76
+            with (cd.cd(path)):
77
+
78
+                gen_loc_data(
79
+                    conf,
80
+                    lambda row: writer.writerow([repo_name, row.hash, row.stamp, row.file_count, row.lines_inserted, row.lines_deleted, row.total_lines]))

+ 75
- 0
gitstats/data_generators/gen_revision_data.py Datei anzeigen

@@ -0,0 +1,75 @@
1
+import csv
2
+import os
3
+
4
+from multiprocessing import Pool
5
+
6
+from gitstats import cli, cd
7
+from gitstats.miscfuncs import getlogrange, getpipeoutput, getnumoffilesfromrev
8
+from gitstats.data import Revision
9
+
10
+
11
+def gen_revision_data(conf, row_processor):
12
+    '''
13
+    Given a configuration, pull revision information. For
14
+    each revision, callback to the row_processor passing a Revision
15
+
16
+    :param conf: configuration (mostly used for date limits)
17
+    :param row_processor: function to receive the callback
18
+    :return: Number of commits
19
+    '''
20
+
21
+    revisions = {} # tree_hash -> Revision
22
+    # Collect revision statistics
23
+    # Outputs "<tree hash> <sha> <stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
24
+
25
+    # DBG: git rev-list --pretty=format:"%T %H %at %ai %aN <%aE>" --since="2017-10-01" "HEAD"', 'grep -v ^commit'
26
+    lines = getpipeoutput(
27
+        ['git rev-list --pretty=format:"%%T %%H %%at %%ai %%aN <%%aE>" %s' % getlogrange(conf, 'HEAD'),
28
+         'grep -v ^commit']).split('\n')
29
+    for line in lines:
30
+        parts = line.split(' ', 6)
31
+        tree_hash = parts[0]
32
+        sha = parts[1]
33
+        try:
34
+            stamp = int(parts[2])
35
+        except ValueError:
36
+            stamp = 0
37
+        timezone = parts[5]
38
+        author, mail = parts[6].split('<', 1)
39
+        author = author.strip()
40
+        mail = mail.rstrip('>')
41
+        domain = '?'
42
+        if mail.find('@') != -1:
43
+            domain = mail.rsplit('@', 1)[1]
44
+            domain.rstrip('>')
45
+        revisions[tree_hash] = Revision(sha, stamp, timezone, author, mail, domain)
46
+
47
+    # todo: consider putting in a cache for this. There was one in the original code
48
+    # DBG: git ls-tree -r --name-only "ceb3165b51ae0680724fd71e16a5ff836a0de41e"', 'wc -l'
49
+    pool = Pool(processes=conf['processes'])
50
+    rev_count = pool.map(getnumoffilesfromrev, revisions.keys())
51
+    pool.terminate()
52
+    pool.join()
53
+    # Attach the file count to each revision and pass it to the callback
54
+    for (rev, count) in rev_count:
55
+        revision = revisions[rev]
56
+        revision.file_count = count
57
+        row_processor(revision)
58
+
59
+    return len(lines)
60
+
61
+
62
+if __name__ == "__main__":
63
+    conf, paths, outputpath = cli.get_cli()
64
+    with open(outputpath, 'w', encoding='utf8') as f:
65
+        writer = csv.writer(f)
66
+        writer.writerow(['repo', 'sha', 'stamp', 'timezone', 'author', 'email', 'domain', 'files_changed'])
67
+
68
+        for path in paths:
69
+            repo_name = os.path.split(path)[1]
70
+            with (cd.cd(path)):
71
+
72
+                gen_revision_data(
73
+                    conf,
74
+                    lambda row: writer.writerow([repo_name, row.sha, row.stamp, row.timezone, row.author, row.email,
75
+                                                 row.domain, row.file_count]))

+ 74
- 0
gitstats/data_generators/gen_tag_data.py Datei anzeigen

@@ -0,0 +1,74 @@
1
+import csv
2
+import os
3
+
4
+from collections import defaultdict
5
+from typing import Dict
6
+
7
+from gitstats import cli, cd
8
+from gitstats.miscfuncs import getpipeoutput
9
+from gitstats.data import AuthorTotals, Tag
10
+from gitstats.data_generators import gen_author_totals_data
11
+
12
+
13
+def gen_tag_data(conf, row_processor):
14
+    '''
15
+    Given a configuration, pull tag information. For
16
+    each tag, callback to the row_processor passing a Tag
17
+
18
+    :param conf: configuration (mostly used for date limits)
19
+    :param row_processor: function to receive the callback
20
+    :return: None
21
+    '''
22
+
23
+    # tags
24
+    tags = {} # stamp -> tags
25
+    lines = getpipeoutput(['git show-ref --tags']).split('\n')
26
+    for line in lines:
27
+        if len(line) == 0:
28
+            continue
29
+        (line_hash, tag) = line.split(' ')
30
+
31
+        tag = tag.replace('refs/tags/', '')
32
+        output = getpipeoutput(['git log "%s" --pretty=format:"%%at" -n 1' % line_hash])
33
+
34
+        stamp = 0
35
+        if len(output) > 0:
36
+            try:
37
+                stamp = int(output.strip())
38
+            except ValueError:
39
+                stamp = 0
40
+
41
+        tags[stamp] = Tag(tag, stamp, line_hash, 0, {})
42
+
43
+    stamps = sorted(tags.keys())
44
+    prev = None
45
+    for stamp in stamps:
46
+        def process_row(row: AuthorTotals):
47
+            tags[stamp].authors[row.author] = row.total_commits
48
+            tags[stamp].commits += row.total_commits
49
+
50
+        revision_tags = tags[stamp].tag
51
+        if prev != None:
52
+            revision_tags += ' "^%s"' % prev
53
+
54
+        gen_author_totals_data(conf, process_row, revision_tags)
55
+        row_processor(tags[stamp])
56
+
57
+        prev = tags[stamp].tag
58
+
59
+
60
+if __name__ == "__main__":
61
+    conf, paths, outputpath = cli.get_cli()
62
+    with open(outputpath, 'w', encoding='utf8') as f:
63
+        writer = csv.writer(f)
64
+        writer.writerow(['repo', 'sha', 'stamp', 'commits for tag', 'author', 'commits by author'])
65
+
66
+        for path in paths:
67
+            repo_name = os.path.split(path)[1]
68
+            with (cd.cd(path)):
69
+
70
+                def process_row(row):
71
+                    for author, commits in row.authors.items():
72
+                        writer.writerow([repo_name, row.hash, row.stamp, row.commits, author, commits])
73
+
74
+                gen_tag_data(conf, process_row)

+ 2
- 1
gitstats/datacollector.py Datei anzeigen

@@ -5,6 +5,7 @@ import pickle
5 5
 import time
6 6
 import zlib
7 7
 
8
+from collections import defaultdict
8 9
 from typing import Dict
9 10
 from gitstats.data.author import Author
10 11
 
@@ -26,7 +27,7 @@ class DataCollector:
26 27
         self.activity_by_year_week = {}  # yy_wNN -> commits
27 28
         self.activity_by_year_week_peak = 0
28 29
 
29
-        self.authors: Dict[Author] = {}  # name -> Author
30
+        self.authors: Dict[Author] = defaultdict(lambda: Author())
30 31
 
31 32
         self.total_commits = 0
32 33
         self.total_files = 0

+ 69
- 260
gitstats/gitdatacollector.py Datei anzeigen

@@ -1,14 +1,12 @@
1 1
 import datetime
2
-import logging
3
-import re
4
-import os
5 2
 
6
-from multiprocessing import Pool
3
+from collections import defaultdict
7 4
 
8
-from gitstats.data.author import Author
9 5
 from gitstats.datacollector import DataCollector
10
-from gitstats.miscfuncs import getcommitrange, getlogrange, getnumoffilesfromrev, getnumoflinesinblob, \
11
-    getpipeoutput, getstatsummarycounts
6
+from gitstats.data import Author, AuthorRow, File, LocByDate, Revision, Tag
7
+from gitstats.data_generators import gen_author_data, gen_author_totals_data, gen_tag_data, gen_revision_data, \
8
+    gen_file_data, gen_loc_data
9
+from gitstats.miscfuncs import getpipeoutput
12 10
 
13 11
 
14 12
 class GitDataCollector(DataCollector):
@@ -18,201 +16,77 @@ class GitDataCollector(DataCollector):
18 16
     def collect(self, directory):
19 17
         super(GitDataCollector, self).collect(directory)
20 18
 
21
-        self.total_authors += int(getpipeoutput(['git shortlog -s %s' % getlogrange(self.conf), 'wc -l']))
22
-        # self.total_lines = int(getoutput('git-ls-files -z |xargs -0 cat |wc -l'))
19
+        self.total_authors += self.get_total_authors()
23 20
         self.get_tags()
24 21
         self.get_revision_info()
25 22
         self.get_file_info()
26 23
         self.get_loc_info()
27 24
         self.get_author_info()
28 25
 
29
-    def xlate(self, name):
30
-        if name in self.conf['name_xlate']:
31
-            return self.conf['name_xlate'][name]
32
-        return name
26
+    def get_total_authors(self):
27
+        return gen_author_totals_data(self.conf)
33 28
 
34 29
     def get_author_info(self):
35 30
         # Per-author statistics
36 31
         # defined for stamp, author only if author committed at this timestamp.
37
-        self.changes_by_date_by_author = {}  # stamp -> author -> lines_added
38
-        # Similar to the above, but never use --first-parent
39
-        # (we need to walk through every commit to know who
40
-        # committed what, not just through mainline)
41
-        lines = getpipeoutput(
42
-            ['git log --shortstat --date-order --pretty=format:"%%at %%aN" %s' % (
43
-                getlogrange(self.conf, 'HEAD'))]).split('\n')
44
-        lines.reverse()
45
-        inserted = 0
46
-        deleted = 0
47
-        stamp = 0
48
-        for line in lines:
49
-            if len(line) == 0:
50
-                continue
51
-
52
-            # <stamp> <author>
53
-            if re.search('files? changed', line) is None:
54
-                pos = line.find(' ')
55
-                if pos != -1:
56
-                    try:
57
-                        oldstamp = stamp
58
-                        (stamp, author) = (int(line[:pos]), line[pos + 1:])
59
-                        author = self.xlate(author)
60
-                        if oldstamp > stamp:
61
-                            # clock skew, keep old timestamp to avoid having ugly graph
62
-                            stamp = oldstamp
63
-                        if author not in self.authors:
64
-                            self.authors[author] = Author()
65
-                        self.authors[author].commits += 1
66
-                        self.authors[author].lines_added += inserted
67
-                        self.authors[author].lines_removed += deleted
68
-                        if stamp not in self.changes_by_date_by_author:
69
-                            self.changes_by_date_by_author[stamp] = {}
70
-                        if author not in self.changes_by_date_by_author[stamp]:
71
-                            self.changes_by_date_by_author[stamp][author] = Author()
72
-                        self.changes_by_date_by_author[stamp][author].lines_added = self.authors[author].lines_added
73
-                        self.changes_by_date_by_author[stamp][author].commits = self.authors[author].commits
74
-                        files, inserted, deleted = 0, 0, 0
75
-                    except ValueError:
76
-                        logging.warning(f'unexpected line "{line}')
77
-                else:
78
-                    logging.warning(f'unexpected line "{line}')
79
-            else:
80
-                numbers = getstatsummarycounts(line)
81 32
 
82
-                if len(numbers) == 3:
83
-                    (files, inserted, deleted) = map(lambda el: int(el), numbers)
84
-                else:
85
-                    logging.warning(f'Failed to handle line "{line}"')
86
-                    (files, inserted, deleted) = (0, 0, 0)
33
+        self.changes_by_date_by_author = defaultdict(lambda: defaultdict(lambda: Author())) # stamp -> author -> lines_added
34
+
35
+        def row_processor(row: AuthorRow):
36
+            self.authors[row.author].commits += 1
37
+            self.authors[row.author].lines_added += row.lines_inserted
38
+            self.authors[row.author].lines_removed += row.lines_deleted
39
+            self.changes_by_date_by_author[row.stamp][row.author].lines_added = self.authors[row.author].lines_added
40
+            self.changes_by_date_by_author[row.stamp][row.author].commits = self.authors[row.author].commits
41
+
42
+        gen_author_data(self.conf, row_processor)
87 43
 
88 44
     def get_loc_info(self):
89
-        # line statistics
90
-        # outputs:
91
-        #  N files changed, N insertions (+), N deletions(-)
92
-        # <stamp> <author>
45
+
93 46
         self.changes_by_date = {}  # stamp -> { files, ins, del }
94
-        # computation of lines of code by date is better done
95
-        # on a linear history.
96
-        extra = ''
97
-        if self.conf['linear_linestats']:
98
-            extra = '--first-parent -m'
99
-        lines = getpipeoutput(
100
-            ['git log --shortstat %s --pretty=format:"%%at %%aN" %s' % (extra, getlogrange(self.conf, 'HEAD'))]).split(
101
-            '\n')
102
-        lines.reverse()
103
-        files = 0
104
-        inserted = 0
105
-        deleted = 0
106
-        total_lines = 0
107
-        for line in lines:
108
-            if len(line) == 0:
109
-                continue
110
-
111
-            # <stamp> <author>
112
-            if re.search('files? changed', line) is None:
113
-                pos = line.find(' ')
114
-                if pos != -1:
115
-                    try:
116
-                        (stamp, author) = (int(line[:pos]), line[pos + 1:])
117
-                        self.changes_by_date[stamp] = {'files': files, 'ins': inserted, 'del': deleted,
118
-                                                       'lines': total_lines}
119
-
120
-                        date = datetime.datetime.fromtimestamp(stamp)
121
-                        yymm = date.strftime('%Y-%m')
122
-                        self.lines_added_by_month[yymm] = self.lines_added_by_month.get(yymm, 0) + inserted
123
-                        self.lines_removed_by_month[yymm] = self.lines_removed_by_month.get(yymm, 0) + deleted
124
-
125
-                        yy = date.year
126
-                        self.lines_added_by_year[yy] = self.lines_added_by_year.get(yy, 0) + inserted
127
-                        self.lines_removed_by_year[yy] = self.lines_removed_by_year.get(yy, 0) + deleted
128
-
129
-                        files, inserted, deleted = 0, 0, 0
130
-                    except ValueError:
131
-                        logging.warning(f'unexpected line "{line}')
132
-                else:
133
-                    logging.warning(f'unexpected line "{line}')
134
-            else:
135
-                numbers = getstatsummarycounts(line)
47
+        def row_processor(row: LocByDate):
48
+            self.changes_by_date[row.stamp] = {
49
+                'files': row.file_count,
50
+                'ins': row.lines_inserted,
51
+                'del': row.lines_deleted,
52
+                'lines': row.total_lines
53
+            }
54
+            date = datetime.datetime.fromtimestamp(row.stamp)
55
+            yymm = date.strftime('%Y-%m')
56
+            self.lines_added_by_month[yymm] = self.lines_added_by_month.get(yymm, 0) + row.lines_inserted
57
+            self.lines_removed_by_month[yymm] = self.lines_removed_by_month.get(yymm, 0) + row.lines_deleted
58
+
59
+            yy = date.year
60
+            self.lines_added_by_year[yy] = self.lines_added_by_year.get(yy, 0) + row.lines_inserted
61
+            self.lines_removed_by_year[yy] = self.lines_removed_by_year.get(yy, 0) + row.lines_deleted
136 62
 
137
-                if len(numbers) == 3:
138
-                    (files, inserted, deleted) = map(lambda el: int(el), numbers)
139
-                    total_lines += inserted
140
-                    total_lines -= deleted
141
-                    self.total_lines_added += inserted
142
-                    self.total_lines_removed += deleted
63
+            self.total_lines_added += row.lines_inserted
64
+            self.total_lines_removed += row.lines_deleted
143 65
 
144
-                else:
145
-                    logging.warning(f'Failed to handle line "{line}"')
146
-                    (files, inserted, deleted) = (0, 0, 0)
147
-            # self.changes_by_date[stamp] = { 'files': files, 'ins': inserted, 'del': deleted }
148
-        self.total_lines += total_lines
66
+        self.total_lines += gen_loc_data(self.conf, row_processor)
149 67
 
150 68
     def get_file_info(self):
151 69
         # extensions and size of files
152
-        lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange(self.conf, 'HEAD', end_only=True)]).split(
153
-            '\000')
154
-        blobs_to_read = []
155
-        for line in lines:
156
-            if len(line) == 0:
157
-                continue
158
-            parts = re.split('\s+', line, 4)
159
-            if parts[0] == '160000' and parts[3] == '-':
160
-                # skip submodules
161
-                continue
162
-            blob_id = parts[2]
163
-            size = int(parts[3])
164
-            fullpath = parts[4]
165
-
166
-            self.total_size += size
70
+        def row_processor(row: File):
71
+            self.total_size += row.size
167 72
             self.total_files += 1
73
+            if row.ext not in self.extensions:
74
+                self.extensions[row.ext] = {'files': 0, 'lines': 0}
75
+            self.extensions[row.ext]['files'] += 1
76
+            self.extensions[row.ext]['lines'] += row.lines
168 77
 
169
-            _, ext = os.path.splitext(fullpath)
170
-            if len(ext) > self.conf['max_ext_length']:
171
-                ext = ''
172
-            if ext not in self.extensions:
173
-                self.extensions[ext] = {'files': 0, 'lines': 0}
174
-            self.extensions[ext]['files'] += 1
175
-            # if cache empty then add ext and blob id to list of new blob's
176
-            # otherwise try to read needed info from cache
177
-            if 'lines_in_blob' not in self.cache.keys():
178
-                blobs_to_read.append((ext, blob_id))
179
-                continue
180
-            if blob_id in self.cache['lines_in_blob'].keys():
181
-                self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
182
-            else:
183
-                blobs_to_read.append((ext, blob_id))
184
-        # Get info abount line count for new blob's that wasn't found in cache
185
-        pool = Pool(processes=self.conf['processes'])
186
-        ext_blob_linecount = pool.map(getnumoflinesinblob, blobs_to_read)
187
-        pool.terminate()
188
-        pool.join()
189
-        # Update cache and write down info about number of number of lines
190
-        for (ext, blob_id, linecount) in ext_blob_linecount:
191
-            if 'lines_in_blob' not in self.cache:
192
-                self.cache['lines_in_blob'] = {}
193
-            self.cache['lines_in_blob'][blob_id] = linecount
194
-            self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
78
+        gen_file_data(self.conf, row_processor)
195 79
 
196 80
     def get_revision_info(self):
197 81
         # Collect revision statistics
198 82
         # Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
199
-        lines = getpipeoutput(
200
-            ['git rev-list --pretty=format:"%%at %%ai %%aN <%%aE>" %s' % getlogrange(self.conf, 'HEAD'),
201
-             'grep -v ^commit']).split(
202
-            '\n')
203
-        for line in lines:
204
-            parts = line.split(' ', 4)
205
-            try:
206
-                stamp = int(parts[0])
207
-            except ValueError:
208
-                stamp = 0
209
-            timezone = parts[3]
210
-            author, mail = parts[4].split('<', 1)
211
-            author = self.xlate(author.rstrip())
212
-            mail = mail.rstrip('>')
213
-            domain = '?'
214
-            if mail.find('@') != -1:
215
-                domain = mail.rsplit('@', 1)[1]
83
+
84
+        def row_processor(row: Revision):
85
+            stamp = row.stamp
86
+            domain = row.domain
87
+            author = row.author
88
+            timezone = row.timezone
89
+
216 90
             date = datetime.datetime.fromtimestamp(float(stamp))
217 91
 
218 92
             # First and last commit stamp (may be in any order because of cherry-picking and patches)
@@ -258,8 +132,6 @@ class GitDataCollector(DataCollector):
258 132
                 self.activity_by_year_week_peak = self.activity_by_year_week[yyw]
259 133
 
260 134
             # author stats
261
-            if author not in self.authors:
262
-                self.authors[author] = Author()
263 135
             self.authors[author].activity_by_day_and_hour[day][hour] += 1
264 136
             # commits, note again that commits may be in any date order because of cherry-picking and patches
265 137
             if not self.authors[author].last_commit_stamp:
@@ -303,99 +175,36 @@ class GitDataCollector(DataCollector):
303 175
 
304 176
             # timezone
305 177
             self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
306
-        # outputs "<stamp> <files>" for each revision
307
-        revlines = getpipeoutput(
308
-            ['git rev-list --pretty=format:"%%at %%T" %s' % getlogrange(self.conf, 'HEAD'),
309
-             'grep -v ^commit']).strip().split('\n')
310
-        lines = []
311
-        revs_to_read = []
312
-        # Look up rev in cache and take info from cache if found
313
-        # If not append rev to list of rev to read from repo
314
-        for revline in revlines:
315
-            time, rev = revline.split(' ')
316
-            # if cache empty then add time and rev to list of new rev's
317
-            # otherwise try to read needed info from cache
318
-            if 'files_in_tree' not in self.cache.keys():
319
-                revs_to_read.append((time, rev))
320
-                continue
321
-            if rev in self.cache['files_in_tree'].keys():
322
-                lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
323
-            else:
324
-                revs_to_read.append((time, rev))
325
-        # Read revisions from repo
326
-        pool = Pool(processes=self.conf['processes'])
327
-        time_rev_count = pool.map(getnumoffilesfromrev, revs_to_read)
328
-        pool.terminate()
329
-        pool.join()
330
-        # Update cache with new revisions and append then to general list
331
-        for (time, rev, count) in time_rev_count:
332
-            if 'files_in_tree' not in self.cache:
333
-                self.cache['files_in_tree'] = {}
334
-            self.cache['files_in_tree'][rev] = count
335
-            lines.append('%d %d' % (int(time), count))
336
-        self.total_commits += len(lines)
337
-        for line in lines:
338
-            parts = line.split(' ')
339
-            if len(parts) != 2:
340
-                continue
341
-            (stamp, files) = parts[0:2]
342
-            try:
343
-                self.files_by_stamp[int(stamp)] = int(files)
344
-            except ValueError:
345
-                logging.warning(f'Failed to parse line "{line}"')
178
+
179
+            # file counts
180
+            self.files_by_stamp[stamp] = row.file_count
181
+
182
+        self.total_commits += gen_revision_data(self.conf, row_processor)
346 183
 
347 184
     def get_tags(self):
348
-        # tags
349
-        lines = getpipeoutput(['git show-ref --tags']).split('\n')
350
-        for line in lines:
351
-            if len(line) == 0:
352
-                continue
353
-            (line_hash, tag) = line.split(' ')
354
-
355
-            tag = tag.replace('refs/tags/', '')
356
-            output = getpipeoutput(['git log "%s" --pretty=format:"%%at %%aN" -n 1' % line_hash])
357
-            if len(output) > 0:
358
-                parts = output.split(' ')
359
-                try:
360
-                    stamp = int(parts[0])
361
-                except ValueError:
362
-                    stamp = 0
363
-                self.tags[tag] = {'stamp': stamp,
364
-                                  'hash': line_hash,
365
-                                  'date': datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d'),
366
-                                  'commits': 0,
367
-                                  'authors': {}}
368
-        # collect info on tags, starting from latest
369
-        tags_sorted_by_date_asc = [tup[1] for tup in sorted([(el[1]['date'], el[0]) for el in self.tags.items()])]
370
-        # tags_sorted_by_date_desc = map(lambda el: el[1],
371
-        #                                reversed(sorted(map(lambda el: (el[1]['date'], el[0]), self.tags.items()))))
372
-        prev = None
373
-        #        for tag in reversed(tags_sorted_by_date_desc):
374
-        for tag in tags_sorted_by_date_asc:
375
-            cmd = 'git shortlog -s "%s"' % tag
376
-            if prev is not None:
377
-                cmd += ' "^%s"' % prev
378
-            output = getpipeoutput([cmd])
379
-            if len(output) == 0:
380
-                continue
381
-            prev = tag
382
-            for line in output.split('\n'):
383
-                parts = re.split('\s+', line, 2)
384
-                commits = int(parts[1])
385
-                author = parts[2]
386
-                self.tags[tag]['commits'] += commits
387
-                self.tags[tag]['authors'][author] = commits
185
+        def row_processor(row: Tag):
186
+            self.tags[row.tag] = {
187
+                'stamp': row.stamp,
188
+                'hash': row.hash,
189
+                'date': datetime.datetime.fromtimestamp(row.stamp).strftime('%Y-%m-%d'),
190
+                'commits': row.commits,
191
+                'authors': row.authors
192
+            }
193
+
194
+        gen_tag_data(self.conf, row_processor)
388 195
 
389 196
     def refine(self):
390 197
         # authors
391 198
         # name -> {place_by_commits, commits_frac, date_first, date_last, timedelta}
392 199
         self.authors_by_commits = self.getAuthors()
200
+        total_commits_without_merge = 0
393 201
         for i, name in enumerate(self.authors_by_commits):
394 202
             self.authors[name].place_by_commits = i + 1
203
+            total_commits_without_merge += self.authors[name].commits
395 204
 
396 205
         for name in self.authors.keys():
397 206
             a = self.authors[name]
398
-            a.commits_frac = (100 * float(a.commits)) / self.getTotalCommits()
207
+            a.commits_frac = (100 * float(a.commits)) / total_commits_without_merge
399 208
             date_first = datetime.datetime.fromtimestamp(a.first_commit_stamp)
400 209
             date_last = datetime.datetime.fromtimestamp(a.last_commit_stamp)
401 210
             delta = date_last - date_first

+ 0
- 138
gitstats/gitstats.py Datei anzeigen

@@ -1,138 +0,0 @@
1
-#!/usr/bin/python
2
-# Copyright (c) 2007-2014 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/AUTHOR)
3
-# GPLv2 / GPLv3
4
-import getopt
5
-import logging
6
-import os
7
-import sys
8
-import time
9
-
10
-import multiprocessing_logging
11
-
12
-from collections import defaultdict
13
-
14
-from .gitdatacollector import GitDataCollector
15
-from .htmlreportcreator import HTMLReportCreator
16
-from .miscfuncs import getgnuplotversion
17
-
18
-exectime_internal = 0.0
19
-exectime_external = 0.0
20
-
21
-conf = {
22
-    'max_domains': 10,
23
-    'max_ext_length': 10,
24
-    'style': 'gitstats.css',
25
-    'max_authors': 20,
26
-    'authors_top': 5,
27
-    'commit_begin': '',
28
-    'commit_end': 'HEAD',
29
-    'linear_linestats': 1,
30
-    'project_name': '',
31
-    'processes': 8,
32
-    'start_date': '',
33
-    'logging': logging.INFO,
34
-    'name_xlate': defaultdict(dict)
35
-}
36
-
37
-class GitStats:
38
-    def _usage(self):
39
-        print(f"""
40
-    Usage: gitstats [options] <gitpath..> <outputpath>
41
-
42
-    Options:
43
-    -c key=value     Override configuration value
44
-    -n key=value     Define author name equivalency (key will treated the same as value)
45
-
46
-    Default config values:
47
-    {conf}
48
-
49
-    Please see the manual page for more details.
50
-    """)
51
-
52
-    def run(self):
53
-        optlist, args = getopt.getopt(sys.argv[1:], 'hc:n:', ["help"])
54
-        for o, v in optlist:
55
-            if o == '-c':
56
-                key, value = v.split('=', 1)
57
-                if key not in conf:
58
-                    raise KeyError('no such key "%s" in config' % key)
59
-                if isinstance(conf[key], int):
60
-                    conf[key] = int(value)
61
-                else:
62
-                    conf[key] = value
63
-            elif o in ('-h', '--help'):
64
-                self._usage()
65
-                sys.exit()
66
-            elif o == '-n':
67
-                key, value = v.split('=', 1)
68
-                conf['name_xlate'][key] = value
69
-
70
-
71
-        if len(args) < 2:
72
-            self._usage()
73
-            sys.exit(0)
74
-
75
-        outputpath = os.path.abspath(args[-1])
76
-        paths = args[0:-1]
77
-        outputpath = os.path.abspath(outputpath)
78
-
79
-        logging.basicConfig(level=conf['logging'], format='%(message)s')
80
-        multiprocessing_logging.install_mp_handler()
81
-        time_start = time.time()
82
-
83
-
84
-        rundir = os.getcwd()
85
-
86
-        try:
87
-            os.makedirs(outputpath)
88
-        except OSError:
89
-            pass
90
-        if not os.path.isdir(outputpath):
91
-            logging.fatal('Output path is not a directory or does not exist')
92
-            sys.exit(1)
93
-
94
-        if not getgnuplotversion():
95
-            logging.error('gnuplot not found')
96
-            sys.exit(1)
97
-
98
-        logging.info(f'Output path: {outputpath}')
99
-        cachefile = os.path.join(outputpath, 'gitstats.cache')
100
-
101
-        data = GitDataCollector(conf)
102
-        data.loadCache(cachefile)
103
-
104
-        for gitpath in paths:
105
-            logging.info(f'Git path: {gitpath}')
106
-
107
-            prevdir = os.getcwd()
108
-            os.chdir(gitpath)
109
-
110
-            logging.info('Collecting data...')
111
-            data.collect(gitpath)
112
-
113
-            os.chdir(prevdir)
114
-
115
-        data.saveCache(cachefile)
116
-
117
-        logging.info('Refining data...')
118
-        data.refine()
119
-
120
-        os.chdir(rundir)
121
-
122
-        logging.info('Generating report...')
123
-        report = HTMLReportCreator(conf)
124
-        report.create(data, outputpath)
125
-
126
-        time_end = time.time()
127
-        calculated_exectime_internal = time_end - time_start
128
-        logging.info(f'Execution time {calculated_exectime_internal} secs, {exectime_external} secs ({(100.0 * exectime_external) / calculated_exectime_internal}%) in external commands)')
129
-
130
-        print('You may now run:')
131
-        print()
132
-        print('   sensible-browser \'%s\'' % os.path.join(outputpath, 'index.html').replace("'", "'\\''"))
133
-        print()
134
-
135
-
136
-if __name__ == '__main__':
137
-    g = GitStats()
138
-    g.run()

+ 7
- 6
gitstats/miscfuncs.py Datei anzeigen

@@ -80,19 +80,20 @@ def getgnuplotversion():
80 80
     return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
81 81
 
82 82
 
83
-def getnumoffilesfromrev(time_rev):
83
+def getnumoffilesfromrev(tree_hash):
84 84
     """
85 85
     Get number of files contained in the given tree (recursive listing)
86 86
     """
87
-    time_portion, rev = time_rev
88
-    return (int(time_portion), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
87
+    # DBG: git ls-tree -r --name-only "ceb3165b51ae0680724fd71e16a5ff836a0de41e" | wc -l
88
+    return (tree_hash, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % tree_hash, 'wc -l']).split('\n')[0]))
89 89
 
90 90
 
91
-def getnumoflinesinblob(ext_blob):
91
+def getnumoflinesinblob(blob_id):
92 92
     """
93 93
     Get number of lines in blob
94 94
     """
95
-    ext, blob_id = ext_blob
96
-    return ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0])
95
+
96
+    # DBG: git cat-file blob e4f17a621893811250be96c8ef9c37b5e97a1df7 | wc -l
97
+    return blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0])
97 98
 
98 99
 

+ 81
- 0
gitstats/process.py Datei anzeigen

@@ -0,0 +1,81 @@
1
+#!/usr/bin/python
2
+# Copyright (c) 2007-2014 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/AUTHOR)
3
+# GPLv2 / GPLv3
4
+import logging
5
+import os
6
+import sys
7
+import time
8
+
9
+import multiprocessing_logging
10
+
11
+from gitstats.gitdatacollector import GitDataCollector
12
+from gitstats.htmlreportcreator import HTMLReportCreator
13
+from gitstats.miscfuncs import getgnuplotversion
14
+from gitstats import cli
15
+
16
+exectime_internal = 0.0
17
+exectime_external = 0.0
18
+
19
+def run():
20
+
21
+    conf, paths, outputpath = cli.get_cli()
22
+
23
+    logging.basicConfig(level=conf['logging'], format='%(message)s')
24
+    multiprocessing_logging.install_mp_handler()
25
+    time_start = time.time()
26
+
27
+
28
+    rundir = os.getcwd()
29
+
30
+    try:
31
+        os.makedirs(outputpath)
32
+    except OSError:
33
+        pass
34
+    if not os.path.isdir(outputpath):
35
+        logging.fatal('Output path is not a directory or does not exist')
36
+        sys.exit(1)
37
+
38
+    if not getgnuplotversion():
39
+        logging.error('gnuplot not found')
40
+        sys.exit(1)
41
+
42
+    logging.info(f'Output path: {outputpath}')
43
+    cachefile = os.path.join(outputpath, 'gitstats.cache')
44
+
45
+    data = GitDataCollector(conf)
46
+    data.loadCache(cachefile)
47
+
48
+    for gitpath in paths:
49
+        logging.info(f'Git path: {gitpath}')
50
+
51
+        prevdir = os.getcwd()
52
+        os.chdir(gitpath)
53
+
54
+        logging.info('Collecting data...')
55
+        data.collect(gitpath)
56
+
57
+        os.chdir(prevdir)
58
+
59
+    data.saveCache(cachefile)
60
+
61
+    logging.info('Refining data...')
62
+    data.refine()
63
+
64
+    os.chdir(rundir)
65
+
66
+    logging.info('Generating report...')
67
+    report = HTMLReportCreator(conf)
68
+    report.create(data, outputpath)
69
+
70
+    time_end = time.time()
71
+    calculated_exectime_internal = time_end - time_start
72
+    logging.info(f'Execution time {calculated_exectime_internal} secs, {exectime_external} secs ({(100.0 * exectime_external) / calculated_exectime_internal}%) in external commands)')
73
+
74
+    print('You may now run:')
75
+    print()
76
+    print('   sensible-browser \'%s\'' % os.path.join(outputpath, 'index.html').replace("'", "'\\''"))
77
+    print()
78
+
79
+
80
+if __name__ == '__main__':
81
+    run()