Searching GitHub for regular expression matches in code is a tremendous task.

Prerequisites

Set up the bq command

https://cloud.google.com/bigquery/docs/bq-command-line-tool

Lots of money

About US$5 per search.

This is cheaper than searching all files.

Shell variables used by this template:
$query a regular expression that searches the contents of files
$path_re a regex that matches on the file path
$exclude_path_re a regex that matches on the file path, for pruning results
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#standardSQL
# Cheap search (~US$5/run) over the GitHub sample dataset (a subset of all
# repositories). This text is a shell template: each $(...) block below runs
# in the shell and appends an optional SQL predicate only when the
# corresponding shell variable (query / path_re / exclude_path_re) is set.
SELECT
  sample_repo_name,
  sample_path,
  # Flatten the per-file array of matching lines into one comma-joined string.
  (SELECT STRING_AGG(snip) FROM snippets.snip) AS snippets
FROM (
  SELECT
    # Capture every whole line containing a match, as an array per file.
    sample_repo_name, sample_path, REGEXP_EXTRACT_ALL(content, r'.*$query.*') AS snip
  FROM (
    SELECT sample_repo_name, content, sample_path FROM \`bigquery-public-data.github_repos.sample_contents\`
        # WHERE TRUE lets each optional filter below be appended as a bare AND.
        WHERE TRUE
        $(
        # Emit each predicate only when its shell variable is non-empty.
        if [ -n "$query" ]; then
            lit "AND REGEXP_CONTAINS(content, r'$query')"
        fi

        if [ -n "$path_re" ]; then
            lit "AND REGEXP_CONTAINS(sample_path, r'$path_re')"
        fi

        if [ -n "$exclude_path_re" ]; then
            lit "AND NOT REGEXP_CONTAINS(sample_path, r'$exclude_path_re')"
        fi
        )
  )
) snippets LIMIT $limit

Full contents search – Search all files

About US$20 per search.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#standardSQL
# Expensive search (~US$20/run) over ALL GitHub file contents.
# Shell template: the $(...) blocks append optional predicates driven by
# the shell variables query, path_re, exclude_path_re, MAX_FILE_SIZE, limit.
SELECT
  repo_name,
  path,
  # Flatten the per-file array of matching lines into one string.
  (SELECT STRING_AGG(snip) FROM snippets.snip) AS snippets
FROM (
  SELECT
    files.repo_name, files.path,
        $(
        if [ -n "$query" ]; then
            lit "REGEXP_EXTRACT_ALL(contents.content, r'.*$query.*') AS snip"
        else
            # NOTE(review): this branch yields a STRING, but the outer
            # SELECT unnests snip as an ARRAY (FROM snippets.snip) — the
            # empty-query case likely errors; verify before relying on it.
            lit "\"\" AS snip"
        fi
        )
  FROM (
    # Keep only small, non-binary files before the expensive regex scan.
    SELECT id, size, content, binary FROM \`bigquery-public-data.github_repos.contents\`
        WHERE TRUE
        AND binary = false
        AND size < $MAX_FILE_SIZE
        $(
        if [ -n "$query" ]; then
            lit "AND REGEXP_CONTAINS(content, r'$query')"
        fi
        )
  ) as contents
  JOIN
  # Paths live in a separate table keyed by content id.
  (
    SELECT repo_name, id, path FROM \`bigquery-public-data.github_repos.files\`
        WHERE TRUE
        $(
        if [ -n "$path_re" ]; then
            lit "AND REGEXP_CONTAINS(path, r'$path_re')"
        fi

        if [ -n "$exclude_path_re" ]; then
            lit "AND NOT REGEXP_CONTAINS(path, r'$exclude_path_re')"
        fi
    )
  ) as files
  ON contents.id = files.id
) snippets LIMIT $limit

Using GHTorrent

Using GHTorrent we can sort repositories according to the number of stars.

We can also reduce the amount of repositories searched with regex by limiting our search to repositories of a specific language.

This will make the results cheaper and more relevant.

Shell variables used by this template:
$language restricts the search to repositories written in this language

Join two big query tables with #standardSQL

We will do something like this but for ghtorrent and bigquery-public-data.github_repos.

1
2
3
4
5
6
-- Example shape: join two datasets on a shared key, then aggregate.
SELECT SUM(natality.is_male)
FROM (SELECT is_male, year FROM publicdata.samples.natality) AS natality
INNER JOIN (SELECT year FROM moshap.my_years) AS my_years
    ON natality.year = my_years.year

Results

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#standardSQL
# Full-contents search joined with GHTorrent metadata: optionally restricts
# to one language, drops forks, and ranks results by star (watcher) count.
# Shell template: the $(...) blocks expand in the shell before bq runs it.
SELECT
  contents_search_results.repo_name,
  contents_search_results.path,
  contents_search_results.final_snippets,
  contents_search_results.repo_id,
  contents_search_results.forked_from,
  ranking_info.stars
FROM
(
    SELECT
      final_contents_table.repo_name,
      final_contents_table.path,
      final_contents_table.final_snippets,
      final_contents_table.repo_id,
      final_contents_table.forked_from
    FROM
    (
      SELECT
        ght.id as repo_id,
        pd.repo_name as repo_name,
        pd.path as path,
        pd.bqsnippets as final_snippets,
        ght.forked_from as forked_from
      FROM
      (
      # GHTorrent projects: derive user/repo from the API url; keep only
      # non-forks, and only the requested language when one is given.
      SELECT id, REGEXP_EXTRACT(url, r"[^/]+/[^/]+$") as repo_name, forked_from FROM \`ghtorrent-bq.ght_2018_04_01.projects\` WHERE $(if test -n "$language"; then echo -n "language = \"$language\" AND"; fi) forked_from IS NULL
      ) ght
      JOIN
      # Same full-contents search as above, aggregated per file.
      (
        SELECT
          repo_name,
          path,
          (SELECT STRING_AGG(snip) FROM snippets.snip) AS bqsnippets
        FROM (
          SELECT
            files.repo_name, files.path,
                $(
                if [ -n "$query" ]; then
                    lit "REGEXP_EXTRACT_ALL(contents.content, r'.*$query.*') AS snip"
                else
                    # NOTE(review): yields a STRING, but the enclosing
                    # SELECT unnests snip as an ARRAY — verify the
                    # empty-query case actually works.
                    lit "\"\" AS snip"
                fi
                )
          FROM (
            # Keep only small, non-binary files before the regex scan.
            SELECT id, size, content, binary FROM \`bigquery-public-data.github_repos.contents\`
                WHERE TRUE
                AND binary = false
                AND size < $MAX_FILE_SIZE
                $(
                if [ -n "$query" ]; then
                    lit "AND REGEXP_CONTAINS(content, r'$query')"
                fi
                )
          ) as contents
          JOIN
          (
            SELECT repo_name, id, path FROM \`bigquery-public-data.github_repos.files\`
                WHERE TRUE
                $(
                if [ -n "$path_re" ]; then
                    lit "AND REGEXP_CONTAINS(path, r'$path_re')"
                fi
                )

                $(
                if [ -n "$exclude_path_re" ]; then
                    lit "AND NOT REGEXP_CONTAINS(path, r'$exclude_path_re')"
                fi
                )
          ) as files
          ON contents.id = files.id
        ) snippets
      ) pd
      ON pd.repo_name = ght.repo_name
    ) final_contents_table
) contents_search_results
JOIN
# Star counts: one distinct watcher per user per repository.
(
  SELECT repo_id, COUNT(DISTINCT user_id) as stars FROM \`ghtorrent-bq.ght_2018_04_01.watchers\`
      GROUP BY repo_id
) ranking_info
ON contents_search_results.repo_id = ranking_info.repo_id
ORDER BY stars DESC
LIMIT $limit

Generate the org-mode document with all results

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
lit "* query"
lit "#+BEGIN_SRC sql" -n
lit "$sql" | indent 2
lit "#+END_SRC"
lit
lit "* results"
lit "$sql" | ds -q gh-query-sql | tee /tmp/wizard-last-query.txt | ci bq -q --format=csv query --max_rows=100000 | ds -q bq-query | {
    count2=0
    awk 1 | sed 1,2d | while IFS=$'\n' read -r line; do
        repo="$(lit "$line" | awk -F, '{print $1}')"
        path="$(lit "$line" | awk -F, '{print $2}')"
        stars="$(lit "$line" | awk -F, '{print $6}')"
        snips="$(lit "$line" | sed 's/^[^,]\+,[^,]\+,//')"

        link="https://github.com/$repo/blob/master/${path}#L2-L10"
        rawlink="https://raw.githubusercontent.com/$repo/master/$path"

        if url-exists.js "$rawlink"; then
            # td_tempdir="$(mktemp -t -d td_tempdirXXXXXX || echo /dev/null)"
            td_tempdir="$NOTES/programs/gh-query/$slug"
            mkdir -p "$td_tempdir"
            wget -P "$td_tempdir" "$rawlink" &>/dev/null
            fpath="$(wfind "$td_tempdir")"
        fi

        bn="$(basename "$path")"
        lit "** ${count2}: $repo"
        lit "|stars"
        lit "|-"
        lit "|$stars"

        # lit "+ line : $line"
        lit "+ link :: $(printf -- "%s\n" "$link" | get_link)"
        lit "+ raw :: [[$rawlink][$bn]]"
        if [ -n "$fpath" ]; then
            lit "+ local :: [[$fpath][$fpath]]"
        fi
        lit "#+BEGIN_SRC text" -n
        lit "$snips" | format_snippet
        lit "#+END_SRC"
        lit

        count2=$(( $count2 + 1 ))
    done
} | ds -q gh-query

Annex

Tables used

1
2
3
# Inspect table metadata (size, schema) for the tables queried above.
bq show bigquery-public-data:github_repos.contents
bq show bigquery-public-data:samples.wikipedia
# prettyjson output piped through jq extracts just the column definitions.
bq show --format=prettyjson bigquery-public-data:samples.wikipedia | jq '.schema.fields'

Download the schema

1
2
3
4
5
# Several equivalent ways to dump a table schema as JSON.
bq show --format json bigquery-public-data:samples.wikipedia
bq show --format json bigquery-public-data:github_repos.sample_contents | python -m json.tool
# jq extracts only the field (column) definitions from the full metadata.
bq show --format=prettyjson bigquery-public-data:samples.wikipedia | jq '.schema.fields'
bq show --format=prettyjson bigquery-public-data:github_repos.sample_contents | jq '.schema.fields'
# --schema asks bq for the fields directly, without the rest of the metadata.
bq show --schema --format=prettyjson bigquery-public-data:github_repos.sample_contents