Bladeren bron

Dwr/pull request tracking (#1)

* Begin looking at code to track pull requests (just looking at Bitbucket for now; should translate to GitHub and GitLab)

* Finish PullRequest duration calculation

* Finish PullRequest duration calculation

* Finish PullRequest duration calculation

* use total_seconds duration for PR analysis
Daniel Rapp 7 jaren geleden
bovenliggende
commit
741dfb56b6
Geen account gekoppeld aan de committers e-mail
4 gewijzigde bestanden met toevoegingen van 132 en 2 verwijderingen
  1. 15
    0
      gitstats/data/pr.py
  2. 90
    0
      gitstats/data_generators/gen_pr_data.py
  3. 15
    2
      gitstats/git_csv_generator.py
  4. 12
    0
      gitstats/miscfuncs.py

+ 15
- 0
gitstats/data/pr.py Bestand weergeven

@@ -0,0 +1,15 @@
1
+from datetime import timedelta
2
+from dataclasses import dataclass
3
+from typing import List
4
+
5
+@dataclass
6
+class PullRequest:
7
+    stamp: int
8
+    hash: str
9
+    author: str
10
+    parent_hashes: List[str]
11
+    branch_rev: str = None
12
+    master_rev: str = None
13
+    duration: timedelta = None
14
+    invalid_pr: bool = False
15
+

+ 90
- 0
gitstats/data_generators/gen_pr_data.py Bestand weergeven

@@ -0,0 +1,90 @@
1
+import csv
2
+import logging
3
+import os
4
+
5
+from multiprocessing import Pool
6
+
7
+from gitstats import cli, cd
8
+from gitstats.miscfuncs import getlogrange, getpipeoutput, gettimedelta
9
+from gitstats.data import PullRequest
10
+
11
+
12
+def gen_pr_data(conf, row_processor):
13
+    '''
14
+    Given a configuration, pull revision information. For
15
+    each valid pull request, call back to the row_processor passing a PullRequest
16
+
17
+    :param conf: configuration (mostly used for date limits)
18
+    :param row_processor: function to receive the callback
19
+    :return: None
20
+    '''
21
+
22
+    prs = {} # hash -> PullRequest
23
+
24
+    # DBG: git log --all --grep="Merge pull request .* to master" --shortstat --pretty=format:"%H %at %aN|%P" --since="2017-10-01" "HEAD"', 'grep -v ^"files changed"'
25
+    lines = getpipeoutput(
26
+        ['git log --all --grep="Merge pull request .* to master" --shortstat '
27
+         '--pretty=format:"%%H %%at %%aN|%%P" %s' % getlogrange(conf, 'HEAD'),
28
+         'grep -v ^"files changed"']).split('\n')
29
+    for line in lines:
30
+        line = line.strip()
31
+        if line and not 'files changed' in line:
32
+            parts = line.split(' ', 2)
33
+            hash = parts[0]
34
+            try:
35
+                stamp = int(parts[1])
36
+            except ValueError:
37
+                stamp = 0
38
+            (author, parent_hashes) = parts[2].split('|')
39
+            parent_hashes = parent_hashes.split(' ')
40
+            if len(parent_hashes) == 2:
41
+                prs[hash] = PullRequest(stamp, hash, author, parent_hashes)
42
+
43
+    keys = prs.keys()
44
+    for pr in prs.values():
45
+        if pr.parent_hashes[0] in keys:
46
+            pr.master_rev = pr.parent_hashes[0]
47
+            if pr.parent_hashes[1] in keys:
48
+                logging.warning(f"Unexpected branching: {pr}")
49
+                pr.invalid_pr = True
50
+            else:
51
+                pr.branch_rev = pr.parent_hashes[1]
52
+        else:
53
+            pr.branch_rev = pr.parent_hashes[0]
54
+            if pr.parent_hashes[1] in keys:
55
+                pr.master_rev = pr.parent_hashes[1]
56
+            else:
57
+                logging.warning(f"Unexpected branching: {pr}")
58
+                pr.invalid_pr = True
59
+
60
+    prs_to_query = [(pr.hash, pr.stamp, pr.branch_rev) for pr in prs.values() if not pr.invalid_pr]
61
+
62
+    # # todo: consider putting in a cache for this. There was one in the original code
63
+    # # DBG:  git log -n 1 --format=%at "ceb3165b51ae0680724fd71e16a5ff836a0de41e"
64
+    pool = Pool(processes=conf['processes'])
65
+    time_deltas = pool.map(gettimedelta, prs_to_query)
66
+    pool.terminate()
67
+    pool.join()
68
+    for (hash, timedelta) in time_deltas:
69
+        pr = prs[hash]
70
+        pr.duration = timedelta
71
+        if pr.duration.total_seconds() < 0:
72
+            pr.invalid_pr = True
73
+            logging.warning(f"Unexpected. Negative duration: {pr}")
74
+        else:
75
+            row_processor(pr)
76
+
77
+
78
+
79
+if __name__ == "__main__":
80
+    conf, paths, outputpath = cli.get_cli()
81
+    with open(outputpath, 'w', encoding='utf8') as f:
82
+        writer = csv.writer(f)
83
+        writer.writerow(['repo', 'hash', 'stamp', 'masterRev', 'branchRev', 'prMergeDuration', 'prMergeDurationHr'])
84
+
85
+        for path in paths:
86
+            repo_name = os.path.split(path)[1]
87
+            with (cd.cd(path)):
88
+                def row_processor(row: PullRequest):
89
+                    writer.writerow([repo_name, row.hash, row.stamp, row.master_rev, row.branch_rev, row.duration.total_seconds(), row.duration])
90
+                gen_pr_data(conf, row_processor)

+ 15
- 2
gitstats/git_csv_generator.py Bestand weergeven

@@ -6,9 +6,9 @@ import sys
6 6
 import multiprocessing_logging
7 7
 
8 8
 from gitstats import cli
9
-from gitstats.data import AuthorTotals, AuthorRow, File, LocByDate, Revision, Tag
9
+from gitstats.data import AuthorTotals, AuthorRow, File, LocByDate, PullRequest, Revision, Tag
10 10
 from gitstats.data_generators import gen_author_data, gen_author_totals_data, gen_tag_data, gen_revision_data, \
11
-    gen_file_data, gen_loc_data
11
+    gen_file_data, gen_loc_data, gen_pr_data
12 12
 
13 13
 exectime_internal = 0.0
14 14
 exectime_external = 0.0
@@ -47,6 +47,10 @@ class _FileHandles:
47 47
         self.repo_info_writer = csv.writer(self.repo_info)
48 48
         self.repo_info_writer.writerow(['Repo', 'TotalFiles', 'TotalLines'])
49 49
 
50
+        self.prs_info = open(os.path.join(output_dir, 'prs.csv'), 'w', encoding='utf8')
51
+        self.prs_info_writer = csv.writer(self.prs_info)
52
+        self.prs_info_writer.writerow(['Repo', 'CommitHash', 'TimeStamp', 'ParentHashMaster', 'ParentHashBranch', 'PrMergeDuration'])
53
+
50 54
     def close(self):
51 55
         self.author_info.close()
52 56
         self.author_totals_info.close()
@@ -55,6 +59,7 @@ class _FileHandles:
55 59
         self.files_info.close()
56 60
         self.loc_info.close()
57 61
         self.repo_info.close()
62
+        self.prs_info.close()
58 63
 
59 64
 
60 65
 class GitCsvGenerator():
@@ -81,6 +86,7 @@ class GitCsvGenerator():
81 86
         self.get_file_info()
82 87
         self.get_loc_info()
83 88
         self.get_author_info()
89
+        self.get_pr_info()
84 90
 
85 91
     def get_total_authors(self):
86 92
         logging.info(f"Getting author totals for {self.projectname}")
@@ -123,6 +129,13 @@ class GitCsvGenerator():
123 129
                                                     row.files_modified, row.lines_inserted, row.lines_deleted])
124 130
         gen_author_data(self.conf, row_processor)
125 131
 
132
+    def get_pr_info(self):
133
+        logging.info(f"Getting pull request info for {self.projectname}")
134
+        def row_processor(row: PullRequest):
135
+            self.files.prs_info_writer.writerow([self.projectname, row.hash, row.stamp, row.master_rev,
136
+                                                    row.branch_rev, row.duration.total_seconds()])
137
+        gen_pr_data(self.conf, row_processor)
138
+
126 139
 def gen_csv():
127 140
     conf, paths, outputpath = cli.get_cli()
128 141
 

+ 12
- 0
gitstats/miscfuncs.py Bestand weergeven

@@ -1,3 +1,4 @@
1
+import datetime
1 2
 import logging
2 3
 import os
3 4
 import re
@@ -96,4 +97,15 @@ def getnumoflinesinblob(blob_id):
96 97
     # DBG: git cat-file blob e4f17a621893811250be96c8ef9c37b5e97a1df7', 'wc -l'
97 98
     return blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0])
98 99
 
100
+def gettimedelta(sha_tuple):
101
+    """
102
+    Get the time delta between the time stamp passed in the tuple ([1]) and the author time stamp of the rev whose sha is in the tuple ([2]).
103
+    Return the result, keyed by the sha of the first rev in the tuple ([0]).
104
+    """
105
+    # DBG:  git log -n 1 --format=%at "ceb3165b51ae0680724fd71e16a5ff836a0de41e"'
106
+    timestamp_branch = int(getpipeoutput([' git log -n 1 --format=%%at "%s"' % sha_tuple[2]]).split('\n')[0])
107
+    delta = datetime.datetime.utcfromtimestamp(sha_tuple[1]) - datetime.datetime.utcfromtimestamp(timestamp_branch)
108
+
109
+    return (sha_tuple[0], delta)
110
+
99 111