#define USE_THE_REPOSITORY_VARIABLE #include "git-compat-util.h" #include "refs.h" #include "object-store-ll.h" #include "cache-tree.h" #include "mergesort.h" #include "commit.h" #include "convert.h" #include "diff.h" #include "diffcore.h" #include "gettext.h" #include "hex.h" #include "path.h" #include "read-cache.h" #include "revision.h" #include "setup.h" #include "tag.h" #include "trace2.h" #include "blame.h" #include "alloc.h" #include "commit-slab.h" #include "bloom.h" #include "commit-graph.h" define_commit_slab(blame_suspects, struct blame_origin *); static struct blame_suspects blame_suspects; struct blame_origin *get_blame_suspects(struct commit *commit) { … } static void set_blame_suspects(struct commit *commit, struct blame_origin *origin) { … } void blame_origin_decref(struct blame_origin *o) { … } /* * Given a commit and a path in it, create a new origin structure. * The callers that add blame to the scoreboard should use * get_origin() to obtain shared, refcounted copy instead of calling * this function directly. */ static struct blame_origin *make_origin(struct commit *commit, const char *path) { … } /* * Locate an existing origin or create a new one. * This moves the origin to front position in the commit util list. */ static struct blame_origin *get_origin(struct commit *commit, const char *path) { … } static void verify_working_tree_path(struct repository *r, struct commit *work_tree, const char *path) { … } static struct commit_list **append_parent(struct repository *r, struct commit_list **tail, const struct object_id *oid) { … } static void append_merge_parents(struct repository *r, struct commit_list **tail) { … } /* * This isn't as simple as passing sb->buf and sb->len, because we * want to transfer ownership of the buffer to the commit (so we * must use detach). */ static void set_commit_buffer_from_strbuf(struct repository *r, struct commit *c, struct strbuf *sb) { … } /* * Prepare a dummy commit that represents the work tree (or staged) item. * Note that annotating work tree item never works in the reverse. */ static struct commit *fake_working_tree_commit(struct repository *r, struct diff_options *opt, const char *path, const char *contents_from, struct object_id *oid) { … } static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b, xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts) { … } static const char *get_next_line(const char *start, const char *end) { … } static int find_line_starts(int **line_starts, const char *buf, unsigned long len) { … } struct fingerprint_entry; /* A fingerprint is intended to loosely represent a string, such that two * fingerprints can be quickly compared to give an indication of the similarity * of the strings that they represent. * * A fingerprint is represented as a multiset of the lower-cased byte pairs in * the string that it represents. Whitespace is added at each end of the * string. Whitespace pairs are ignored. Whitespace is converted to '\0'. * For example, the string "Darth Radar" will be converted to the following * fingerprint: * {"\0d", "da", "da", "ar", "ar", "rt", "th", "h\0", "\0r", "ra", "ad", "r\0"} * * The similarity between two fingerprints is the size of the intersection of * their multisets, including repeated elements. See fingerprint_similarity for * examples. * * For ease of implementation, the fingerprint is implemented as a map * of byte pairs to the count of that byte pair in the string, instead of * allowing repeated elements in a set. */ struct fingerprint { … }; /* A byte pair in a fingerprint. Stores the number of times the byte pair * occurs in the string that the fingerprint represents. */ struct fingerprint_entry { … }; /* See `struct fingerprint` for an explanation of what a fingerprint is. * \param result the fingerprint of the string is stored here. This must be * freed later using free_fingerprint. * \param line_begin the start of the string * \param line_end the end of the string */ static void get_fingerprint(struct fingerprint *result, const char *line_begin, const char *line_end) { … } static void free_fingerprint(struct fingerprint *f) { … } /* Calculates the similarity between two fingerprints as the size of the * intersection of their multisets, including repeated elements. See * `struct fingerprint` for an explanation of the fingerprint representation. * The similarity between "cat mat" and "father rather" is 2 because "at" is * present twice in both strings while the similarity between "tim" and "mit" * is 0. */ static int fingerprint_similarity(struct fingerprint *a, struct fingerprint *b) { … } /* Subtracts byte-pair elements in B from A, modifying A in place. */ static void fingerprint_subtract(struct fingerprint *a, struct fingerprint *b) { … } /* Calculate fingerprints for a series of lines. * Puts the fingerprints in the fingerprints array, which must have been * preallocated to allow storing line_count elements. */ static void get_line_fingerprints(struct fingerprint *fingerprints, const char *content, const int *line_starts, long first_line, long line_count) { … } static void free_line_fingerprints(struct fingerprint *fingerprints, int nr_fingerprints) { … } /* This contains the data necessary to linearly map a line number in one half * of a diff chunk to the line in the other half of the diff chunk that is * closest in terms of its position as a fraction of the length of the chunk. */ struct line_number_mapping { … }; /* Given a line number in one range, offset and scale it to map it onto the * other range. * Essentially this mapping is a simple linear equation but the calculation is * more complicated to allow performing it with integer operations. * Another complication is that if a line could map onto many lines in the * destination range then we want to choose the line at the center of those * possibilities. * Example: if the chunk is 2 lines long in A and 10 lines long in B then the * first 5 lines in B will map onto the first line in the A chunk, while the * last 5 lines will all map onto the second line in the A chunk. * Example: if the chunk is 10 lines long in A and 2 lines long in B then line * 0 in B will map onto line 2 in A, and line 1 in B will map onto line 7 in A. */ static int map_line_number(int line_number, const struct line_number_mapping *mapping) { … } /* Get a pointer to the element storing the similarity between a line in A * and a line in B. * * The similarities are stored in a 2-dimensional array. Each "row" in the * array contains the similarities for a line in B. The similarities stored in * a row are the similarities between the line in B and the nearby lines in A. * To keep the length of each row the same, it is padded out with values of -1 * where the search range extends beyond the lines in A. * For example, if max_search_distance_a is 2 and the two sides of a diff chunk * look like this: * a | m * b | n * c | o * d | p * e | q * Then the similarity array will contain: * [-1, -1, am, bm, cm, * -1, an, bn, cn, dn, * ao, bo, co, do, eo, * bp, cp, dp, ep, -1, * cq, dq, eq, -1, -1] * Where similarities are denoted either by -1 for invalid, or the * concatenation of the two lines in the diff being compared. * * \param similarities array of similarities between lines in A and B * \param line_a the index of the line in A, in the same frame of reference as * closest_line_a. * \param local_line_b the index of the line in B, relative to the first line * in B that similarities represents. * \param closest_line_a the index of the line in A that is deemed to be * closest to local_line_b. This must be in the same * frame of reference as line_a. This value defines * where similarities is centered for the line in B. * \param max_search_distance_a maximum distance in lines from the closest line * in A for other lines in A for which * similarities may be calculated. */ static int *get_similarity(int *similarities, int line_a, int local_line_b, int closest_line_a, int max_search_distance_a) { … } #define CERTAIN_NOTHING_MATCHES … #define CERTAINTY_NOT_CALCULATED … /* Given a line in B, first calculate its similarities with nearby lines in A * if not already calculated, then identify the most similar and second most * similar lines. The "certainty" is calculated based on those two * similarities. * * \param start_a the index of the first line of the chunk in A * \param length_a the length in lines of the chunk in A * \param local_line_b the index of the line in B, relative to the first line * in the chunk. * \param fingerprints_a array of fingerprints for the chunk in A * \param fingerprints_b array of fingerprints for the chunk in B * \param similarities 2-dimensional array of similarities between lines in A * and B. See get_similarity() for more details. * \param certainties array of values indicating how strongly a line in B is * matched with some line in A. * \param second_best_result array of absolute indices in A for the second * closest match of a line in B. * \param result array of absolute indices in A for the closest match of a line * in B. * \param max_search_distance_a maximum distance in lines from the closest line * in A for other lines in A for which * similarities may be calculated. * \param map_line_number_in_b_to_a parameter to map_line_number(). */ static void find_best_line_matches( int start_a, int length_a, int start_b, int local_line_b, struct fingerprint *fingerprints_a, struct fingerprint *fingerprints_b, int *similarities, int *certainties, int *second_best_result, int *result, const int max_search_distance_a, const struct line_number_mapping *map_line_number_in_b_to_a) { … } /* * This finds the line that we can match with the most confidence, and * uses it as a partition. It then calls itself on the lines on either side of * that partition. In this way we avoid lines appearing out of order, and * retain a sensible line ordering. * \param start_a index of the first line in A with which lines in B may be * compared. * \param start_b index of the first line in B for which matching should be * done. * \param length_a number of lines in A with which lines in B may be compared. * \param length_b number of lines in B for which matching should be done. * \param fingerprints_a mutable array of fingerprints in A. The first element * corresponds to the line at start_a. * \param fingerprints_b array of fingerprints in B. The first element * corresponds to the line at start_b. * \param similarities 2-dimensional array of similarities between lines in A * and B. See get_similarity() for more details. * \param certainties array of values indicating how strongly a line in B is * matched with some line in A. * \param second_best_result array of absolute indices in A for the second * closest match of a line in B. * \param result array of absolute indices in A for the closest match of a line * in B. * \param max_search_distance_a maximum distance in lines from the closest line * in A for other lines in A for which * similarities may be calculated. * \param max_search_distance_b an upper bound on the greatest possible * distance between lines in B such that they will * both be compared with the same line in A * according to max_search_distance_a. * \param map_line_number_in_b_to_a parameter to map_line_number(). */ static void fuzzy_find_matching_lines_recurse( int start_a, int start_b, int length_a, int length_b, struct fingerprint *fingerprints_a, struct fingerprint *fingerprints_b, int *similarities, int *certainties, int *second_best_result, int *result, int max_search_distance_a, int max_search_distance_b, const struct line_number_mapping *map_line_number_in_b_to_a) { … } /* Find the lines in the parent line range that most closely match the lines in * the target line range. This is accomplished by matching fingerprints in each * blame_origin, and choosing the best matches that preserve the line ordering. * See struct fingerprint for details of fingerprint matching, and * fuzzy_find_matching_lines_recurse for details of preserving line ordering. * * The performance is believed to be O(n log n) in the typical case and O(n^2) * in a pathological case, where n is the number of lines in the target range. */ static int *fuzzy_find_matching_lines(struct blame_origin *parent, struct blame_origin *target, int tlno, int parent_slno, int same, int parent_len) { … } static void fill_origin_fingerprints(struct blame_origin *o) { … } static void drop_origin_fingerprints(struct blame_origin *o) { … } /* * Given an origin, prepare mmfile_t structure to be used by the * diff machinery */ static void fill_origin_blob(struct diff_options *opt, struct blame_origin *o, mmfile_t *file, int *num_read_blob, int fill_fingerprints) { … } static void drop_origin_blob(struct blame_origin *o) { … } /* * Any merge of blames happens on lists of blames that arrived via * different parents in a single suspect. In this case, we want to * sort according to the suspect line numbers as opposed to the final * image line numbers. The function body is somewhat longish because * it avoids unnecessary writes. */ static struct blame_entry *blame_merge(struct blame_entry *list1, struct blame_entry *list2) { … } DEFINE_LIST_SORT(…) …; /* * Final image line numbers are all different, so we don't need a * three-way comparison here. */ static int compare_blame_final(const struct blame_entry *e1, const struct blame_entry *e2) { … } static int compare_blame_suspect(const struct blame_entry *s1, const struct blame_entry *s2) { … } void blame_sort_final(struct blame_scoreboard *sb) { … } static int compare_commits_by_reverse_commit_date(const void *a, const void *b, void *c) { … } /* * For debugging -- origin is refcounted, and this asserts that * we do not underflow. */ static void sanity_check_refcnt(struct blame_scoreboard *sb) { … } /* * If two blame entries that are next to each other came from * contiguous lines in the same origin (i.e. <commit, path> pair), * merge them together. */ void blame_coalesce(struct blame_scoreboard *sb) { … } /* * Merge the given sorted list of blames into a preexisting origin. * If there were no previous blames to that commit, it is entered into * the commit priority queue of the score board. */ static void queue_blames(struct blame_scoreboard *sb, struct blame_origin *porigin, struct blame_entry *sorted) { … } /* * Fill the blob_sha1 field of an origin if it hasn't, so that later * call to fill_origin_blob() can use it to locate the data. blob_sha1 * for an origin is also used to pass the blame for the entire file to * the parent to detect the case where a child's blob is identical to * that of its parent's. * * This also fills origin->mode for corresponding tree path. */ static int fill_blob_sha1_and_mode(struct repository *r, struct blame_origin *origin) { … } struct blame_bloom_data { … }; static int bloom_count_queries = …; static int bloom_count_no = …; static int maybe_changed_path(struct repository *r, struct blame_origin *origin, struct blame_bloom_data *bd) { … } static void add_bloom_key(struct blame_bloom_data *bd, const char *path) { … } /* * We have an origin -- check if the same path exists in the * parent and return an origin structure to represent it. */ static struct blame_origin *find_origin(struct repository *r, struct commit *parent, struct blame_origin *origin, struct blame_bloom_data *bd) { … } /* * We have an origin -- find the path that corresponds to it in its * parent and return an origin structure to represent it. */ static struct blame_origin *find_rename(struct repository *r, struct commit *parent, struct blame_origin *origin, struct blame_bloom_data *bd) { … } /* * Append a new blame entry to a given output queue. */ static void add_blame_entry(struct blame_entry ***queue, const struct blame_entry *src) { … } /* * src typically is on-stack; we want to copy the information in it to * a malloced blame_entry that gets added to the given queue. The * origin of dst loses a refcnt. */ static void dup_entry(struct blame_entry ***queue, struct blame_entry *dst, struct blame_entry *src) { … } const char *blame_nth_line(struct blame_scoreboard *sb, long lno) { … } /* * It is known that lines between tlno to same came from parent, and e * has an overlap with that range. it also is known that parent's * line plno corresponds to e's line tlno. * * <---- e -----> * <------> * <------------> * <------------> * <------------------> * * Split e into potentially three parts; before this chunk, the chunk * to be blamed for the parent, and after that portion. */ static void split_overlap(struct blame_entry *split, struct blame_entry *e, int tlno, int plno, int same, struct blame_origin *parent) { … } /* * split_overlap() divided an existing blame e into up to three parts * in split. Any assigned blame is moved to queue to * reflect the split. */ static void split_blame(struct blame_entry ***blamed, struct blame_entry ***unblamed, struct blame_entry *split, struct blame_entry *e) { … } /* * After splitting the blame, the origins used by the * on-stack blame_entry should lose one refcnt each. */ static void decref_split(struct blame_entry *split) { … } /* * reverse_blame reverses the list given in head, appending tail. * That allows us to build lists in reverse order, then reverse them * afterwards. This can be faster than building the list in proper * order right away. The reason is that building in proper order * requires writing a link in the _previous_ element, while building * in reverse order just requires placing the list head into the * _current_ element. */ static struct blame_entry *reverse_blame(struct blame_entry *head, struct blame_entry *tail) { … } /* * Splits a blame entry into two entries at 'len' lines. The original 'e' * consists of len lines, i.e. [e->lno, e->lno + len), and the second part, * which is returned, consists of the remainder: [e->lno + len, e->lno + * e->num_lines). The caller needs to sort out the reference counting for the * new entry's suspect. */ static struct blame_entry *split_blame_at(struct blame_entry *e, int len, struct blame_origin *new_suspect) { … } struct blame_line_tracker { … }; static int are_lines_adjacent(struct blame_line_tracker *first, struct blame_line_tracker *second) { … } static int scan_parent_range(struct fingerprint *p_fps, struct fingerprint *t_fps, int t_idx, int from, int nr_lines) { … } /* * The first pass checks the blame entry (from the target) against the parent's * diff chunk. If that fails for a line, the second pass tries to match that * line to any part of parent file. That catches cases where a change was * broken into two chunks by 'context.' */ static void guess_line_blames(struct blame_origin *parent, struct blame_origin *target, int tlno, int offset, int same, int parent_len, struct blame_line_tracker *line_blames) { … } /* * This decides which parts of a blame entry go to the parent (added to the * ignoredp list) and which stay with the target (added to the diffp list). The * actual decision was made in a separate heuristic function, and those answers * for the lines in 'e' are in line_blames. This consumes e, essentially * putting it on a list. * * Note that the blame entries on the ignoredp list are not necessarily sorted * with respect to the parent's line numbers yet. */ static void ignore_blame_entry(struct blame_entry *e, struct blame_origin *parent, struct blame_entry **diffp, struct blame_entry **ignoredp, struct blame_line_tracker *line_blames) { … } /* * Process one hunk from the patch between the current suspect for * blame_entry e and its parent. This first blames any unfinished * entries before the chunk (which is where target and parent start * differing) on the parent, and then splits blame entries at the * start and at the end of the difference region. Since use of -M and * -C options may lead to overlapping/duplicate source line number * ranges, all we can rely on from sorting/merging is the order of the * first suspect line number. * * tlno: line number in the target where this chunk begins * same: line number in the target where this chunk ends * offset: add to tlno to get the chunk starting point in the parent * parent_len: number of lines in the parent chunk */ static void blame_chunk(struct blame_entry ***dstq, struct blame_entry ***srcq, int tlno, int offset, int same, int parent_len, struct blame_origin *parent, struct blame_origin *target, int ignore_diffs) { … } struct blame_chunk_cb_data { … }; /* diff chunks are from parent to target */ static int blame_chunk_cb(long start_a, long count_a, long start_b, long count_b, void *data) { … } /* * We are looking at the origin 'target' and aiming to pass blame * for the lines it is suspected to its parent. Run diff to find * which lines came from parent and pass blame for them. */ static void pass_blame_to_parent(struct blame_scoreboard *sb, struct blame_origin *target, struct blame_origin *parent, int ignore_diffs) { … } /* * The lines in blame_entry after splitting blames many times can become * very small and trivial, and at some point it becomes pointless to * blame the parents. E.g. "\t\t}\n\t}\n\n" appears everywhere in any * ordinary C program, and it is not worth to say it was copied from * totally unrelated file in the parent. * * Compute how trivial the lines in the blame_entry are. */ unsigned blame_entry_score(struct blame_scoreboard *sb, struct blame_entry *e) { … } /* * best_so_far[] and potential[] are both a split of an existing blame_entry * that passes blame to the parent. Maintain best_so_far the best split so * far, by comparing potential and best_so_far and copying potential into * bst_so_far as needed. */ static void copy_split_if_better(struct blame_scoreboard *sb, struct blame_entry *best_so_far, struct blame_entry *potential) { … } /* * We are looking at a part of the final image represented by * ent (tlno and same are offset by ent->s_lno). * tlno is where we are looking at in the final image. * up to (but not including) same match preimage. * plno is where we are looking at in the preimage. * * <-------------- final image ----------------------> * <------ent------> * ^tlno ^same * <---------preimage-----> * ^plno * * All line numbers are 0-based. */ static void handle_split(struct blame_scoreboard *sb, struct blame_entry *ent, int tlno, int plno, int same, struct blame_origin *parent, struct blame_entry *split) { … } struct handle_split_cb_data { … }; static int handle_split_cb(long start_a, long count_a, long start_b, long count_b, void *data) { … } /* * Find the lines from parent that are the same as ent so that * we can pass blames to it. file_p has the blob contents for * the parent. */ static void find_copy_in_blob(struct blame_scoreboard *sb, struct blame_entry *ent, struct blame_origin *parent, struct blame_entry *split, mmfile_t *file_p) { … } /* Move all blame entries from list *source that have a score smaller * than score_min to the front of list *small. * Returns a pointer to the link pointing to the old head of the small list. */ static struct blame_entry **filter_small(struct blame_scoreboard *sb, struct blame_entry **small, struct blame_entry **source, unsigned score_min) { … } /* * See if lines currently target is suspected for can be attributed to * parent. */ static void find_move_in_parent(struct blame_scoreboard *sb, struct blame_entry ***blamed, struct blame_entry **toosmall, struct blame_origin *target, struct blame_origin *parent) { … } struct blame_list { … }; /* * Count the number of entries the target is suspected for, * and prepare a list of entry and the best split. */ static struct blame_list *setup_blame_list(struct blame_entry *unblamed, int *num_ents_p) { … } /* * For lines target is suspected for, see if we can find code movement * across file boundary from the parent commit. porigin is the path * in the parent we already tried. */ static void find_copy_in_parent(struct blame_scoreboard *sb, struct blame_entry ***blamed, struct blame_entry **toosmall, struct blame_origin *target, struct commit *parent, struct blame_origin *porigin, int opt) { … } /* * The blobs of origin and porigin exactly match, so everything * origin is suspected for can be blamed on the parent. */ static void pass_whole_blame(struct blame_scoreboard *sb, struct blame_origin *origin, struct blame_origin *porigin) { … } /* * We pass blame from the current commit to its parents. We keep saying * "parent" (and "porigin"), but what we mean is to find scapegoat to * exonerate ourselves. */ static struct commit_list *first_scapegoat(struct rev_info *revs, struct commit *commit, int reverse) { … } static int num_scapegoats(struct rev_info *revs, struct commit *commit, int reverse) { … } /* Distribute collected unsorted blames to the respected sorted lists * in the various origins. */ static void distribute_blame(struct blame_scoreboard *sb, struct blame_entry *blamed) { … } #define MAXSG … blame_find_alg; static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin, int opt) { … } /* * The main loop -- while we have blobs with lines whose true origin * is still unknown, pick one blob, and allow its lines to pass blames * to its parents. */ void assign_blame(struct blame_scoreboard *sb, int opt) { … } /* * To allow quick access to the contents of nth line in the * final image, prepare an index in the scoreboard. */ static int prepare_lines(struct blame_scoreboard *sb) { … } static struct commit *find_single_final(struct rev_info *revs, const char **name_p) { … } static struct commit *dwim_reverse_initial(struct rev_info *revs, const char **name_p) { … } static struct commit *find_single_initial(struct rev_info *revs, const char **name_p) { … } void init_scoreboard(struct blame_scoreboard *sb) { … } void setup_scoreboard(struct blame_scoreboard *sb, struct blame_origin **orig) { … } struct blame_entry *blame_entry_prepend(struct blame_entry *head, long start, long end, struct blame_origin *o) { … } void setup_blame_bloom_data(struct blame_scoreboard *sb) { … } void cleanup_scoreboard(struct blame_scoreboard *sb) { … }