Bläddra i källkod

Significant rework:

* Read in full revision graph
* Use Tokei to get detailed LOC measurements for each rev
* Trim down number of csv files created
Dan Rapp 7 år sedan
förälder
incheckning
ed4ac3e13e

+ 2
- 0
.gitignore Visa fil

@@ -1,2 +1,4 @@
1 1
 **/__pycache__
2 2
 *.egg-info
3
+build
4
+dist

+ 14
- 0
gitstats/cli.py Visa fil

@@ -3,6 +3,8 @@ import logging
3 3
 import os
4 4
 import sys
5 5
 
6
+from datetime import datetime, timezone
7
+
6 8
 conf = {
7 9
     'max_domains': 10,
8 10
     'max_ext_length': 10,
@@ -15,6 +17,7 @@ conf = {
15 17
     'project_name': '',
16 18
     'processes': 8,
17 19
     'start_date': '',
20
+    'end_date': '',
18 21
     'logging': logging.INFO,
19 22
     'resrouce_file_pattern': '**/resources/**/*',
20 23
 }
@@ -58,3 +61,14 @@ def get_cli():
58 61
     outputpath = os.path.abspath(outputpath)
59 62
 
60 63
     return conf, paths, outputpath
64
+
65
+def get_begin_end_timestamps(conf):
66
+    if 'start_date' in conf and conf['start_date']:
67
+        begin = int(datetime.strptime(conf['start_date'], '%Y-%m-%d').replace(tzinfo=timezone.utc).timestamp())
68
+    else:
69
+        begin = 0
70
+    if 'end_date' in conf and conf['end_date']:
71
+        end = int(datetime.strptime(conf['end_date'], '%Y-%m-%d').replace(tzinfo=timezone.utc).timestamp())
72
+    else:
73
+        end = 99999999999
74
+    return begin, end

+ 3
- 1
gitstats/data/__init__.py Visa fil

@@ -5,4 +5,6 @@ from .tag import Tag
5 5
 from .revision import Revision
6 6
 from .file import File
7 7
 from .loc_by_date import LocByDate
8
-from .pr import PullRequest
8
+from .pr import PullRequest
9
+from .file_info import FileInfo
10
+from .revision_graph import RevisionGraph

+ 25
- 0
gitstats/data/file_info.py Visa fil

@@ -0,0 +1,25 @@
1
+from dataclasses import dataclass
2
+
3
+@dataclass
4
+class FileInfo:
5
+    language: str
6
+    file_count: int
7
+    line_count: int
8
+    code_line_count: int
9
+    comment_line_count: int
10
+    blank_line_count: int
11
+
12
+    def __post_init__(self):
13
+        self.file_count = int(self.file_count)
14
+        self.line_count = int(self.line_count)
15
+        self.code_line_count = int(self.code_line_count)
16
+        self.comment_line_count = int(self.comment_line_count)
17
+        self.blank_line_count = int(self.blank_line_count)
18
+
19
+    def __sub__(self, other: 'FileInfo') -> 'FileInfo':
20
+        return FileInfo(self.language,
21
+                        self.file_count - other.file_count,
22
+                        self.line_count - other.line_count,
23
+                        self.code_line_count - other.code_line_count,
24
+                        self.comment_line_count - other.comment_line_count,
25
+                        self.blank_line_count - other.blank_line_count)

+ 8
- 5
gitstats/data/revision.py Visa fil

@@ -1,9 +1,7 @@
1
-from collections import defaultdict
2 1
 from dataclasses import dataclass, field
2
+from .file_info import FileInfo
3 3
 from typing import Dict
4 4
 
5
-#    # Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
6
-
7 5
 @dataclass
8 6
 class Revision:
9 7
     hash: str
@@ -12,5 +10,10 @@ class Revision:
12 10
     author: str = ''
13 11
     email: str = ''
14 12
     domain: str = ''
15
-    file_count: int = 0
16
-
13
+    comments: str = ''
14
+    master_pr: int = 0
15
+    branch_parent: str = ''
16
+    master_parent: str = ''
17
+    file_infos: Dict[str, FileInfo] = field(default_factory=lambda: {})
18
+    delta: Dict[str, FileInfo] = field(default_factory=lambda: {})
19
+    valid_pr: bool = True

+ 17
- 0
gitstats/data/revision_graph.py Visa fil

@@ -0,0 +1,17 @@
1
+from dataclasses import dataclass
2
+from typing import Dict, List, Set
3
+from gitstats.data.revision import Revision
4
+
5
+@dataclass
6
+class RevisionGraph:
7
+    revisions: Dict[str, Revision]
8
+    master_revs: Set[str]
9
+    linkage: Dict[str, List[str]]
10
+
11
+    def add_revision_to_graph(self, revision: Revision, parents: List[str], is_master: bool=False):
12
+        if not revision.hash in self.revisions:
13
+            self.revisions[revision.hash] = revision
14
+        if not revision.hash in self.linkage:
15
+            self.linkage[revision.hash] = parents
16
+        if revision.master_pr or is_master:
17
+            self.master_revs.add(revision.hash)

+ 3
- 1
gitstats/data_generators/__init__.py Visa fil

@@ -4,4 +4,6 @@ from .gen_tag_data import gen_tag_data
4 4
 from .gen_revision_data import gen_revision_data
5 5
 from .gen_file_data import gen_file_data
6 6
 from .gen_loc_data import gen_loc_data
7
-from .gen_pr_data import gen_pr_data
7
+from .gen_pr_data import gen_pr_data
8
+from .gen_revision_graph import gen_revision_graph
9
+from .gen_complete_file_info import gen_complete_file_info

+ 76
- 0
gitstats/data_generators/gen_complete_file_info.py Visa fil

@@ -0,0 +1,76 @@
1
+import csv
2
+import os
3
+
4
+from gitstats import cli, cd
5
+from gitstats.miscfuncs import getpipeoutput
6
+from gitstats.data import FileInfo, Revision, RevisionGraph
7
+from gitstats.data_generators import gen_revision_graph
8
+
9
+
10
+def gen_complete_file_info(graph: RevisionGraph):
11
+    '''
12
+    Given a dictionary of revisions on the master branch, collect all file info
13
+    using tokei for that revision
14
+
15
+    :param: graph - a RevisionGraph whose master_revs identifies the revisions on the master branch
16
+
17
+    :return: None. As a side effect, complete file info by language type will be added to all
18
+    revisions in master_rev
19
+    '''
20
+
21
+    # use tokei to gather detailed file info for each revision on master
22
+    for revision in graph.master_revs:
23
+        getpipeoutput([f'git checkout {revision}'])
24
+        # for some reason if we combine these, tokei gives incorrect results!!!!
25
+        lines = getpipeoutput(['tokei']).split('\n')
26
+        for line in lines[3:-3] + [lines[-2]]:
27
+            line = line.strip()
28
+            file_info = FileInfo(*line.rsplit(maxsplit=5))
29
+            graph.revisions[revision].file_infos[file_info.language] = file_info
30
+
31
+    getpipeoutput(['git checkout master'])
32
+
33
+    # run through master revisions and calculate delta with previous master revision
34
+    for revision in graph.master_revs:
35
+        master_parent = graph.revisions[revision].master_parent
36
+        if master_parent in graph.master_revs:
37
+            current = graph.revisions[revision].file_infos
38
+            previous = graph.revisions[master_parent].file_infos
39
+            for lang, cur_file_info in current.items():
40
+                if lang in previous:
41
+                    graph.revisions[revision].delta[lang] = cur_file_info - previous[lang]
42
+                else:
43
+                    graph.revisions[revision].delta[lang] = cur_file_info
44
+
45
+
46
+if __name__ == "__main__":
47
+    conf, paths, outputpath = cli.get_cli()
48
+
49
+    with open(outputpath, 'w', encoding='utf8') as f:
50
+        writer = csv.writer(f)
51
+        writer.writerow(['repo', 'hash', 'stamp', 'author', 'language', 'files', 'lines', 'code', 'comments', 'blanks'])
52
+
53
+        for path in paths:
54
+            repo_name = os.path.split(path)[1]
55
+            with (cd.cd(path)):
56
+                graph = gen_revision_graph()
57
+                gen_complete_file_info(graph)
58
+
59
+                for rev in graph.master_revs:
60
+                    revision: Revision = graph.revisions[rev]
61
+                    for lang, file_info in revision.delta.items():
62
+                        if file_info.file_count or \
63
+                                file_info.line_count or \
64
+                                file_info.code_line_count or \
65
+                                file_info.comment_line_count or \
66
+                                file_info.blank_line_count:
67
+                            writer.writerow([repo_name,
68
+                                             revision.hash,
69
+                                             revision.stamp,
70
+                                             graph.revisions[revision.branch_parent].author,
71
+                                             lang,
72
+                                             file_info.file_count,
73
+                                             file_info.line_count,
74
+                                             file_info.code_line_count,
75
+                                             file_info.comment_line_count,
76
+                                             file_info.blank_line_count])

+ 23
- 61
gitstats/data_generators/gen_pr_data.py Visa fil

@@ -2,82 +2,41 @@ import csv
2 2
 import logging
3 3
 import os
4 4
 
5
-from multiprocessing import Pool
5
+from datetime import datetime
6 6
 
7 7
 from gitstats import cli, cd
8
-from gitstats.miscfuncs import getlogrange, getpipeoutput, gettimedelta
9
-from gitstats.data import PullRequest
8
+from gitstats.data import Revision, PullRequest, RevisionGraph
9
+from gitstats.data_generators import gen_revision_graph
10 10
 
11 11
 
12
-def gen_pr_data(conf, row_processor):
12
+def gen_pr_data(row_processor, graph: RevisionGraph):
13 13
     '''
14 14
     Given a configuration, pull revision information. For
15 15
     each author, callback to the row_processor passing a PullRequest
16 16
 
17
-    :param conf: configuration (mostly used for date limits)
17
+    As a side effect, every revision in the master_rev dictionary will be updated
18
+    with its branch_parent and master_parent
19
+
18 20
     :param row_processor: function to receive the callback
19 21
     :return: None
20 22
     '''
21 23
 
22
-    prs = {} # hash -> PullRequest
23
-
24
-    # DBG: git log --all --grep="Merge pull request .* to master" --shortstat --pretty=format:"%H %at %aN" --since="2017-10-01" "HEAD"', 'grep -v ^commit'
25
-    lines = getpipeoutput(
26
-        ['git log --all --grep="Merge pull request .* to master" --shortstat '
27
-         '--pretty=format:"%%H %%at %%aN|%%P" %s' % getlogrange(conf, 'HEAD'),
28
-         'grep -v ^"files changed"']).split('\n')
29
-    for line in lines:
30
-        line = line.strip()
31
-        if line and not 'files changed' in line:
32
-            parts = line.split(' ', 2)
33
-            hash = parts[0]
34
-            try:
35
-                stamp = int(parts[1])
36
-            except ValueError:
37
-                stamp = 0
38
-            (author, parent_hashes) = parts[2].split('|')
39
-            parent_hashes = parent_hashes.split(' ')
40
-            if len(parent_hashes) == 2:
41
-                prs[hash] = PullRequest(stamp, hash, author, parent_hashes)
42
-
43
-    keys = prs.keys()
44
-    for pr in prs.values():
45
-        if pr.parent_hashes[0] in keys:
46
-            pr.master_rev = pr.parent_hashes[0]
47
-            if pr.parent_hashes[1] in keys:
48
-                logging.warning(f"Unexpected branching: {pr}")
49
-                pr.invalid_pr = True
50
-            else:
51
-                pr.branch_rev = pr.parent_hashes[1]
52
-        else:
53
-            pr.branch_rev = pr.parent_hashes[0]
54
-            if pr.parent_hashes[1] in keys:
55
-                pr.master_rev = pr.parent_hashes[1]
24
+    for rev in graph.master_revs:
25
+        revision = graph.revisions[rev]
26
+        if revision.valid_pr and revision.branch_parent in graph.revisions:
27
+            branch_rev: Revision = graph.revisions[revision.branch_parent]
28
+            delta = datetime.utcfromtimestamp(revision.stamp) - datetime.utcfromtimestamp(branch_rev.stamp)
29
+            if delta.total_seconds() < 0:
30
+                logging.warning(f"Unexpected. Negative duration: {rev}")
31
+                revision.valid_pr = False
56 32
             else:
57
-                logging.warning(f"Unexpected branching: {pr}")
58
-                pr.invalid_pr = True
59
-
60
-    prs_to_query = [(pr.hash, pr.stamp, pr.branch_rev) for pr in prs.values() if not pr.invalid_pr]
61
-
62
-    # # todo: consider putting in a cache for this. There was one in the original code
63
-    # # DBG:  git log -n 1 --format=%at "ceb3165b51ae0680724fd71e16a5ff836a0de41e"
64
-    pool = Pool(processes=conf['processes'])
65
-    time_deltas = pool.map(gettimedelta, prs_to_query)
66
-    pool.terminate()
67
-    pool.join()
68
-    for (hash, timedelta) in time_deltas:
69
-        pr = prs[hash]
70
-        pr.duration = timedelta
71
-        if pr.duration.total_seconds() < 0:
72
-            pr.invalid_pr = True
73
-            logging.warning(f"Unexpected. Negative duration: {pr}")
74
-        else:
75
-            row_processor(pr)
76
-
33
+                row_processor(PullRequest(revision.stamp, revision.hash, revision.author,
34
+                                          graph.linkage[rev], revision.branch_parent, rev, delta))
77 35
 
78 36
 
79 37
 if __name__ == "__main__":
80 38
     conf, paths, outputpath = cli.get_cli()
39
+    begin, end = cli.get_begin_end_timestamps(conf)
81 40
     with open(outputpath, 'w', encoding='utf8') as f:
82 41
         writer = csv.writer(f)
83 42
         writer.writerow(['repo', 'hash', 'stamp', 'masterRev', 'branchRev', 'prMergeDuration', 'prMergeDurationHr'])
@@ -85,6 +44,9 @@ if __name__ == "__main__":
85 44
         for path in paths:
86 45
             repo_name = os.path.split(path)[1]
87 46
             with (cd.cd(path)):
47
+                graph = gen_revision_graph()
48
+
88 49
                 def row_processor(row: PullRequest):
89
-                    writer.writerow([repo_name, row.hash, row.stamp, row.master_rev, row.branch_rev, row.duration.total_seconds(), row.duration])
90
-                gen_pr_data(conf, row_processor)
50
+                    if row.stamp >= begin and row.stamp <= end:
51
+                        writer.writerow([repo_name, row.hash, row.stamp, row.master_rev, row.branch_rev, row.duration.total_seconds(), row.duration])
52
+                gen_pr_data(row_processor, graph)

+ 19
- 47
gitstats/data_generators/gen_revision_data.py Visa fil

@@ -4,74 +4,46 @@ import os
4 4
 from multiprocessing import Pool
5 5
 
6 6
 from gitstats import cli, cd
7
-from gitstats.miscfuncs import getlogrange, getpipeoutput, getnumoffilesfromrev
8
-from gitstats.data import Revision
7
+from gitstats.miscfuncs import getnumoffilesfromrev
8
+from gitstats.data import Revision, RevisionGraph
9
+from gitstats.data_generators.gen_revision_graph import gen_revision_graph
9 10
 
10 11
 
11
-def gen_revision_data(conf, row_processor):
12
+def gen_revision_data(conf, row_processor, graph: RevisionGraph):
12 13
     '''
13 14
     Given a configuration, pull revision information. For
14 15
     each author, callback to the row_processor passing a Revision
15 16
 
16 17
     :param conf: configuration (mostly used for date limits)
17 18
     :param row_processor: function to receive the callback
18
-    :return: Number of commits
19
+    :return: None
19 20
     '''
20 21
 
21
-    revisions = {} # tree_hash -> Revision
22
-    # Collect revision statistics
23
-    # Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
24
-
25
-    # DBG: git rev-list --pretty=format:"%at %ai %aN <%aE>" --since="2017-10-01" "HEAD"', 'grep -v ^commit'
26
-    lines = getpipeoutput(
27
-        ['git rev-list --pretty=format:"%%T %%H %%at %%ai %%aN <%%aE>" %s' % getlogrange(conf, 'HEAD'),
28
-         'grep -v ^commit']).split('\n')
29
-    for line in lines:
30
-        line = line.strip()
31
-        if line:
32
-            parts = line.split(' ', 6)
33
-            tree_hash = parts[0]
34
-            sha = parts[1]
35
-            try:
36
-                stamp = int(parts[2])
37
-            except ValueError:
38
-                stamp = 0
39
-            timezone = parts[5]
40
-            author, mail = parts[6].split('<', 1)
41
-            author = author.strip()
42
-            mail = mail.rstrip('>')
43
-            domain = '?'
44
-            if mail.find('@') != -1:
45
-                domain = mail.rsplit('@', 1)[1]
46
-                domain.rstrip('>')
47
-            revisions[tree_hash] = Revision(sha, stamp, timezone, author, mail, domain)
48
-
49
-    if revisions:
50
-        # todo: consider putting in a cache for this. There was one in the original code
51
-        # DBG: git ls-tree -r --name-only "ceb3165b51ae0680724fd71e16a5ff836a0de41e"', 'wc -l'
52
-        pool = Pool(processes=conf['processes'])
53
-        rev_count = pool.map(getnumoffilesfromrev, revisions.keys())
54
-        pool.terminate()
55
-        pool.join()
56
-        # Update cache with new revisions and append then to general list
57
-        for (rev, count) in rev_count:
58
-            revision = revisions[rev]
59
-            revision.file_count = count
60
-            row_processor(revision)
61
-
62
-    return len(lines)
22
+    # todo: consider putting in a cache for this. There was one in the original code
23
+    # DBG: git ls-tree -r --name-only "ceb3165b51ae0680724fd71e16a5ff836a0de41e"', 'wc -l'
24
+    pool = Pool(processes=conf['processes'])
25
+    rev_count = pool.map(getnumoffilesfromrev, graph.revisions.keys())
26
+    pool.terminate()
27
+    pool.join()
28
+    # Update cache with new revisions and append them to the general list
29
+    for (rev, count) in rev_count:
30
+        revision = graph.revisions[rev]
31
+        revision.file_count = count
32
+        row_processor(revision)
63 33
 
64 34
 
65 35
 if __name__ == "__main__":
66 36
     conf, paths, outputpath = cli.get_cli()
37
+    begin, end = cli.get_begin_end_timestamps(conf)
67 38
     with open(outputpath, 'w', encoding='utf8') as f:
68 39
         writer = csv.writer(f)
69 40
         writer.writerow(['repo', 'sha', 'stamp', 'timezone', 'author', 'email', 'domain', 'files_changed'])
70 41
 
71 42
         for path in paths:
43
+            rev_by_tree_hash, _, _, _ = gen_revision_graph(begin, end)
72 44
             repo_name = os.path.split(path)[1]
73 45
             with (cd.cd(path)):
74 46
                 def row_processor(row: Revision):
75 47
                     writer.writerow([repo_name, row.hash, row.stamp, row.timezone, row.author, row.email,
76 48
                                      row.domain, row.file_count])
77
-                gen_revision_data(conf, row_processor)
49
+                gen_revision_data(conf, row_processor, rev_by_tree_hash)

+ 114
- 0
gitstats/data_generators/gen_revision_graph.py Visa fil

@@ -0,0 +1,114 @@
1
+import logging
2
+import os
3
+import re
4
+
5
+from typing import Dict
6
+from gitstats import cli, cd
7
+from gitstats.miscfuncs import getpipeoutput
8
+from gitstats.data import Revision, RevisionGraph
9
+
10
+
11
+def gen_revision_graph() -> RevisionGraph:
12
+    '''
13
+    Get all revisions from the repo,
14
+    key them by tree_hash, commit_hash as well as create a graph of revisions and a list
15
+    of revisions merging to master
16
+
17
+    :return: RevisionGraph
18
+    '''
19
+
20
+    # this match string for PRs merged to master is particular to BitBucket
21
+    # probably should come from configuration
22
+    prog = re.compile(r'Merge pull request #([0-9]*) in.*to master')
23
+
24
+    graph = RevisionGraph({}, set(), {})
25
+
26
+    lines = getpipeoutput(
27
+        [f'git rev-list --pretty="%T|%H|%at|%ai|%aN|%aE|%P|%s" "HEAD"',
28
+         'grep -v ^commit']).split('\n')
29
+    for line in lines:
30
+        line = line.strip()
31
+        if line:
32
+            graph.add_revision_to_graph(*get_revision_from_line(line, prog))
33
+
34
+    new_masters = set()
35
+    for rev in graph.master_revs:
36
+        parents = graph.linkage[rev]
37
+        revision: Revision = graph.revisions[rev]
38
+        for parent in parents:
39
+            if parent in graph.master_revs:
40
+                if revision.master_parent:
41
+                    logging.warning(f"{rev} has multiple master parents")
42
+                    revision.valid_pr = False
43
+                revision.master_parent = parent
44
+            else:
45
+                if revision.branch_parent:
46
+                    if not revision.master_parent:
47
+                        # we likely have a merge into master in a branch that didn't use
48
+                        # bitbucket conventions... arbitrarily choose the oldest parent
49
+                        # revision as the master branch (we could back chain both and find
50
+                        # which branch exists in the ancestry of the other, but for now,
51
+                        # this will suffice
52
+                        if graph.revisions[parent].stamp < graph.revisions[revision.branch_parent].stamp:
53
+                            revision.master_parent = parent
54
+                            new_masters.add(parent)
55
+                        else:
56
+                            revision.master_parent = revision.branch_parent
57
+                            new_masters.add(revision.branch_parent)
58
+                            revision.branch_parent = parent
59
+                    else:
60
+                        logging.warning(f"{rev} has multiple branch parents")
61
+                        revision.valid_pr = False
62
+                else:
63
+                    revision.branch_parent = parent
64
+    graph.master_revs.update(new_masters)
65
+
66
+    # validate masters based on git log --first-parent
67
+    lines = getpipeoutput(
68
+        ['git log --first-parent --pretty="%T|%H|%at|%ai|%aN|%aE|%P|%s"',
69
+         'grep -v ^commit']).split('\n')
70
+    for line in lines:
71
+        line = line.strip()
72
+        if line:
73
+            graph.add_revision_to_graph(*get_revision_from_line(line, prog), is_master=True)
74
+
75
+    # update master branch as appropriate
76
+    for rev in graph.master_revs:
77
+        if not graph.revisions[rev].master_parent:
78
+            parents = graph.linkage[rev]
79
+            if len(parents) == 1 and parents[0]:
80
+                graph.revisions[rev].master_parent = parents[0]
81
+            else:
82
+                if parents[0]:
83
+                    logging.warning(f"{rev} has no master parent info. {parents}")
84
+
85
+    return graph
86
+
87
+
88
+def get_revision_from_line(line, prog):
89
+    tree_hash, sha, stamp, time, author, mail, parents, comments = line.split('|', 7)
90
+    try:
91
+        stamp = int(stamp)
92
+    except ValueError:
93
+        stamp = 0
94
+    timezone = time.split(' ')[2]
95
+    domain = '?'
96
+    if mail.find('@') != -1:
97
+        domain = mail.rsplit('@', 1)[1]
98
+    parents = parents.split(' ')
99
+    revision = Revision(sha, stamp, timezone, author, mail, domain, comments)
100
+    match = prog.search(comments)
101
+    if match:
102
+        revision.master_pr = int(match.group(1))
103
+    return revision, parents
104
+
105
+
106
+if __name__ == "__main__":
107
+    conf, paths, outputpath = cli.get_cli()
108
+    graphs: Dict[str, RevisionGraph] = {}
109
+    for path in paths:
110
+        repo_name = os.path.split(path)[1]
111
+        with (cd.cd(path)):
112
+            graphs[repo_name] = gen_revision_graph()
113
+    for k, v in graphs.items():
114
+        print(f"{k}: {len(v.revisions)} revisions, {len(v.master_revs)} revisions on master")

+ 111
- 83
gitstats/git_csv_generator.py Visa fil

@@ -1,17 +1,18 @@
1 1
 #! /usr/bin/env python3
2 2
 import csv
3
-import glob
4 3
 import logging
5 4
 import os
6 5
 import sys
7 6
 
8 7
 import multiprocessing_logging
8
+
9
+from collections import defaultdict
10
+
9 11
 from gitstats.cd import cd
10 12
 
11 13
 from gitstats import cli
12
-from gitstats.data import AuthorTotals, AuthorRow, File, LocByDate, PullRequest, Revision, Tag
13
-from gitstats.data_generators import gen_author_data, gen_author_totals_data, gen_tag_data, gen_revision_data, \
14
-    gen_file_data, gen_loc_data, gen_pr_data
14
+from gitstats.data import PullRequest, Revision
15
+from gitstats.data_generators import gen_pr_data, gen_revision_graph, gen_complete_file_info
15 16
 
16 17
 exectime_internal = 0.0
17 18
 exectime_external = 0.0
@@ -19,48 +20,39 @@ exectime_external = 0.0
19 20
 
20 21
 class _FileHandles:
21 22
     def __init__(self, output_dir):
22
-        self.author_info = open(os.path.join(output_dir, 'authors.csv'), 'w', encoding='utf8')
23
-        self.author_info_writer = csv.writer(self.author_info)
24
-        self.author_info_writer.writerow(['Repo', 'CommitHash', 'TimeStamp', 'Author', 'FilesChanged', 'LinesInserted',
25
-                                          'LinesDeleted'])
26
-
27 23
         self.author_totals_info = open(os.path.join(output_dir, 'author_totals.csv'), 'w', encoding='utf8')
28 24
         self.author_totals_info_writer = csv.writer(self.author_totals_info)
29 25
         self.author_totals_info_writer.writerow(["Repo", "Author", "Commits"])
30 26
 
31
-        self.tag_info = open(os.path.join(output_dir, 'tags.csv'), 'w', encoding='utf8')
32
-        self.tag_info_writer = csv.writer(self.tag_info)
33
-        self.tag_info_writer.writerow(["Repo", "CommitHash", "Timestamp", "TotalCommits", "Author", "AuthorCommits"])
34
-
35 27
         self.revision_info = open(os.path.join(output_dir, 'revs.csv'), 'w', encoding='utf8')
36 28
         self.revision_info_writer = csv.writer(self.revision_info)
37 29
         self.revision_info_writer.writerow(['Repo', 'CommitHash', 'TimeStamp', 'TimeZone', 'Author', 'AuthorEmail',
38
-                                            'Domain', 'FilesChanged'])
39
-
40
-        self.files_info = open(os.path.join(output_dir, 'files.csv'), 'w', encoding='utf8')
41
-        self.files_info_writer = csv.writer(self.files_info)
42
-        self.files_info_writer.writerow(['Repo', 'File', 'Ext', 'Size', 'Lines', 'Resource'])
30
+                                            'Domain'])
43 31
 
44 32
         self.loc_info = open(os.path.join(output_dir, 'loc.csv'), 'w', encoding='utf8')
45 33
         self.loc_info_writer = csv.writer(self.loc_info)
46
-        self.loc_info_writer.writerow(['Repo', 'CommitHash', 'TimeStamp', 'FileCount', 'LinesInserted', 'LinesDeleted',
47
-                                       'TotalLines'])
34
+        self.loc_info_writer.writerow(['repo', 'hash', 'stamp', 'language', 'files', 'lines', 'code', 'comments',
35
+                                       'blanks'])
36
+
37
+        self.loc_delta = open(os.path.join(output_dir, 'loc_delta.csv'), 'w', encoding='utf8')
38
+        self.loc_delta_writer = csv.writer(self.loc_delta)
39
+        self.loc_delta_writer.writerow(['repo', 'hash', 'stamp', 'author', 'language', 'files', 'lines', 'code',
40
+                                        'comments', 'blanks'])
48 41
 
49 42
         self.repo_info = open(os.path.join(output_dir, 'repo.csv'), 'w', encoding='utf8')
50 43
         self.repo_info_writer = csv.writer(self.repo_info)
51
-        self.repo_info_writer.writerow(['Repo', 'TotalFiles', 'TotalLines'])
44
+        self.repo_info_writer.writerow(['Repo', 'Language', 'TotalFiles', 'TotalLines', 'TotalCodeLines', 'TotalCommentLlines',
45
+                                        'TotalBlankLines'])
52 46
 
53 47
         self.prs_info = open(os.path.join(output_dir, 'prs.csv'), 'w', encoding='utf8')
54 48
         self.prs_info_writer = csv.writer(self.prs_info)
55 49
         self.prs_info_writer.writerow(['Repo', 'CommitHash', 'TimeStamp', 'ParentHashMaster', 'ParentHashBranch', 'PrMergeDuration'])
56 50
 
57 51
     def close(self):
58
-        self.author_info.close()
59 52
         self.author_totals_info.close()
60
-        self.tag_info.close()
61 53
         self.revision_info.close()
62
-        self.files_info.close()
63 54
         self.loc_info.close()
55
+        self.loc_delta.close()
64 56
         self.repo_info.close()
65 57
         self.prs_info.close()
66 58
 
@@ -69,8 +61,7 @@ class GitCsvGenerator():
69 61
         self.conf = conf
70 62
         self.files: _FileHandles = None
71 63
         self.output_dir = output_dir
72
-        self.resource_files = []
73
-        self.igore_files = ''
64
+        self.begin, self.end = cli.get_begin_end_timestamps(conf)
74 65
 
75 66
     def __enter__(self):
76 67
         self.files = _FileHandles(self.output_dir)
@@ -81,72 +72,109 @@ class GitCsvGenerator():
81 72
     def collect(self, dir):
82 73
 
83 74
         with cd(dir):
84
-            self.resource_files = [file for file in glob.glob(self.conf['resrouce_file_pattern'], recursive=True) if os.path.isfile(file)]
85
-
86
-            if self.resource_files:
87
-                self.ignore_files = '" "'.join([f":(exclude){file}" for file in self.resource_files])
88
-                self.ignore_files = f'-- "{self.ignore_files}"'
89
-
90 75
             if len(self.conf['project_name']) == 0:
91 76
                 self.projectname = os.path.basename(os.path.abspath(dir))
92 77
             else:
93 78
                 self.projectname = self.conf['project_name']
94 79
 
95
-            self.get_total_authors()
96
-            self.get_tags()
97
-            self.get_revision_info()
98
-            self.get_file_info()
99
-            self.get_loc_info()
100
-            self.get_author_info()
101
-            self.get_pr_info()
80
+            graph = gen_revision_graph()
81
+            gen_complete_file_info(graph)
82
+
83
+            self.extract_total_authors(graph)
84
+            self.extract_pr_info(graph)
85
+            self.extract_code_info(graph)
86
+            self.extract_revision_info(graph)
87
+            # self.get_revision_info(graph)
88
+            # self.get_tags()
89
+            # self.get_file_info()
90
+            # self.get_loc_info()
91
+            # self.get_author_info()
102 92
 
103
-    def get_total_authors(self):
93
+    def extract_total_authors(self, graph):
104 94
         logging.info(f"Getting author totals for {self.projectname}")
105
-        def row_processor(row: AuthorTotals):
106
-            self.files.author_totals_info_writer.writerow([self.projectname, row.author, row.total_commits])
107
-        gen_author_totals_data(self.conf, row_processor)
108
-
109
-    def get_tags(self):
110
-        logging.info(f"Getting tag info for {self.projectname}")
111
-        def row_processor(row: Tag):
112
-            for author, commits in row.authors.items():
113
-                self.files.tag_info_writer.writerow([self.projectname, row.hash, row.stamp, row.commits, author, commits])
114
-        gen_tag_data(self.conf, row_processor)
115
-
116
-    def get_revision_info(self):
117
-        logging.info(f"Getting rev info for {self.projectname}")
118
-        def row_processor(row: Revision):
119
-            self.files.revision_info_writer.writerow([self.projectname, row.hash, row.stamp, row.timezone, row.author,
120
-                                                      row.email, row.domain, row.file_count])
121
-        gen_revision_data(self.conf, row_processor)
122
-
123
-    def get_file_info(self):
124
-        logging.info(f"Getting file info for {self.projectname}")
125
-        def row_processor(row: File):
126
-            self.files.files_info_writer.writerow([self.projectname, row.full_path, row.ext, row.size, row.lines, row.full_path in self.resource_files])
127
-        gen_file_data(self.conf, row_processor)
128
-
129
-    def get_loc_info(self):
130
-        logging.info(f"Getting LOC info for {self.projectname}")
131
-        def row_processor(row: LocByDate):
132
-            self.files.loc_info_writer.writerow([self.projectname, row.hash, row.stamp, row.file_count,
133
-                                                 row.lines_inserted, row.lines_deleted, row.total_lines])
134
-        total_files, total_lines = gen_loc_data(self.conf, row_processor, self.ignore_files)
135
-        self.files.repo_info_writer.writerow([self.projectname, total_files, total_lines])
136
-
137
-    def get_author_info(self):
138
-        logging.info(f"Getting author info for {self.projectname}")
139
-        def row_processor(row: AuthorRow):
140
-            self.files.author_info_writer.writerow([self.projectname, row.hash, row.stamp, row.author,
141
-                                                    row.files_modified, row.lines_inserted, row.lines_deleted])
142
-        gen_author_data(self.conf, row_processor, self.ignore_files)
143
-
144
-    def get_pr_info(self):
95
+
96
+        authors = defaultdict(int)
97
+        for rev in graph.revisions.values():
98
+            # don't include merge to master as a commit in counting total author
99
+            # commits.
100
+            if rev.stamp >= self.begin and rev.stamp <= self.end and rev.master_pr == 0:
101
+                authors[rev.author] += 1
102
+
103
+        for author, total_commits in authors.items():
104
+            self.files.author_totals_info_writer.writerow([self.projectname, author, total_commits])
105
+
106
+    def extract_pr_info(self, graph):
145 107
         logging.info(f"Getting pull request info for {self.projectname}")
146 108
         def row_processor(row: PullRequest):
147
-            self.files.prs_info_writer.writerow([self.projectname, row.hash, row.stamp, row.master_rev,
148
-                                                    row.branch_rev, row.duration.total_seconds()])
149
-        gen_pr_data(self.conf, row_processor)
109
+            if row.stamp >= self.begin and row.stamp <= self.end:
110
+                self.files.prs_info_writer.writerow([self.projectname, row.hash, row.stamp, row.master_rev,
111
+                                                        row.branch_rev, row.duration.total_seconds()])
112
+        gen_pr_data(row_processor, graph)
113
+
114
+    def extract_code_info(self, graph):
115
+        rev_max: Revision = None
116
+        for rev in graph.master_revs:
117
+            revision: Revision = graph.revisions[rev]
118
+            if not rev_max or revision.stamp > rev_max.stamp:
119
+                rev_max = revision
120
+            if revision.stamp >= self.begin and revision.stamp <= self.end:
121
+                for lang, file_info in revision.delta.items():
122
+                        if file_info.file_count or \
123
+                                file_info.line_count or \
124
+                                file_info.code_line_count or \
125
+                                file_info.comment_line_count or \
126
+                                file_info.blank_line_count:
127
+
128
+                            if revision.branch_parent in graph.revisions:
129
+                                parent = revision.branch_parent
130
+                            else:
131
+                                parent = revision.master_parent
132
+                            if parent:
133
+                                self.files.loc_delta_writer.writerow([self.projectname,
134
+                                                 revision.hash,
135
+                                                 revision.stamp,
136
+                                                 graph.revisions[parent].author,
137
+                                                 lang,
138
+                                                 file_info.file_count,
139
+                                                 file_info.line_count,
140
+                                                 file_info.code_line_count,
141
+                                                 file_info.comment_line_count,
142
+                                                 file_info.blank_line_count])
143
+                for lang, file_info in revision.file_infos.items():
144
+                        if file_info.file_count or \
145
+                                file_info.line_count or \
146
+                                file_info.code_line_count or \
147
+                                file_info.comment_line_count or \
148
+                                file_info.blank_line_count:
149
+                            self.files.loc_info_writer.writerow([self.projectname,
150
+                                             revision.hash,
151
+                                             revision.stamp,
152
+                                             lang,
153
+                                             file_info.file_count,
154
+                                             file_info.line_count,
155
+                                             file_info.code_line_count,
156
+                                             file_info.comment_line_count,
157
+                                             file_info.blank_line_count])
158
+
159
+        for file_info in rev_max.file_infos.values():
160
+            self.files.repo_info_writer.writerow([self.projectname,
161
+                                                  file_info.language,
162
+                                                  file_info.file_count,
163
+                                                  file_info.line_count,
164
+                                                  file_info.code_line_count,
165
+                                                  file_info.comment_line_count,
166
+                                                  file_info.blank_line_count])
167
+
168
+    def extract_revision_info(self, graph):
169
+        for revision in graph.revisions.values():
170
+            if revision.stamp >= self.begin and revision.stamp <= self.end:
171
+                self.files.revision_info_writer.writerow([self.projectname,
172
+                                                          revision.hash,
173
+                                                          revision.stamp,
174
+                                                          revision.timezone,
175
+                                                          revision.author,
176
+                                                          revision.email,
177
+                                                          revision.domain])
150 178
 
151 179
 def gen_csv():
152 180
     conf, paths, outputpath = cli.get_cli()