Dulwich.io dulwich / 3435760
Merge branch 'line-ending-convert-support-bases' of git://github.com/comet-ml/dulwich Jelmer Vernooĳ 13 days ago
3 changed file(s) with 275 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 # line_ending.py -- Line ending conversion functions
1 # Copyright (C) 2018-2018 Boris Feld <boris.feld@comet.ml>
2 #
3 # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
4 # General Public License as published by the Free Software Foundation; version 2.0
5 # or (at your option) any later version. You can redistribute it and/or
6 # modify it under the terms of either of these two licenses.
7 #
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 #
14 # You should have received a copy of the licenses; if not, see
15 # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
16 # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
17 # License, Version 2.0.
18 #
19 """ All line-ending related functions, from conversions to config processing
20
21 Line-ending normalization is a complex beast. Here are some notes and details
22 about how it seems to work.
23
24 The normalization is a two-fold process that happens at two moments:
25
26 - When reading a file from the index to the working directory. For example
27 when doing a `git clone` or `git checkout` call. We call this process the
28 read filter in this module.
29 - When writing a file to the index from the working directory. For example
30 when doing a `git add` call. We call this process the write filter in this
31 module.
32
33 One thing to know is that Git does line-ending normalization only on text
34 files. How does Git know that a file is text? We can either mark a file as a
35 text file, a binary file or ask Git to decide automatically. Git has a
36 heuristic to detect if a file is a text file or a binary file. It seems based
37 on the percentage of non-printable characters in files.
38
39 The code for this heuristic is here:
40 https://git.kernel.org/pub/scm/git/git.git/tree/convert.c#n46
41
42 Dulwich has an implementation with a slightly different heuristic, the
43 `is_binary` function in `dulwich.patch`.
44
45 The binary detection heuristic implementation is close to the one in JGit:
46 https://github.com/eclipse/jgit/blob/f6873ffe522bbc3536969a3a3546bf9a819b92bf/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java#L300
47
48 There are multiple variables that impact the normalization.
49
50 First, a repository can contain a `.gitattributes` file (or more than one...)
51 that can further customize the operation on some file patterns, for example:
52
53 *.txt text
54
55 Force all `.txt` files to be treated as text files and to have their line
56 endings normalized.
57
58 *.jpg -text
59
60 Force all `.jpg` files to be treated as binary files and to not have their line
61 lines endings converted.
62
63 *.vcproj text eol=crlf
64
65 Force all `.vcproj` files to be treated as text files and to have their line
66 endings converted into `CRLF` in working directory no matter the native EOL of
67 the platform.
68
69 *.sh text eol=lf
70
71 Force all `.sh` files to be treated as text files and to have their line
72 endings converted into `LF` in working directory no matter the native EOL of
73 the platform.
74
75 If the `eol` attribute is not defined, Git uses the `core.eol` configuration
76 value described later.
77
78 * text=auto
79
80 Force all files to be scanned by the text file heuristic detection and to have
81 their line endings normalized in case they are detected as text files.
82
83 Git also has an obsolete attribute named `crlf` that can be translated to the
84 corresponding text attribute value.
85
86 Then there are some configuration options (that can be defined at the
87 repository or user level):
88
89 - core.autocrlf
90 - core.eol
91
92 `core.autocrlf` is taken into account for all files that don't have a `text`
93 attribute defined in `.gitattributes`; it takes three possible values:
94
95 - `true`: This forces all files to have CRLF line-endings in the working
96 directory and converts line-endings to LF when writing to the index.
97 When autocrlf is set to true, the `core.eol` value is
98 ignored.
99 - `input`: Quite similar to the `true` value but only forces the write
100 filter, i.e. new files added to the index will get their
101 line-endings converted to LF.
102 - `false` (default): No normalization is done.
103
104 `core.eol` is the top-level configuration to define the line-ending to use
105 when applying the read filter. It takes three possible values:
106
107 - `lf`: When normalization is done, force line-endings to be `LF` in the
108 working directory.
109 - `crlf`: When normalization is done, force line-endings to be `CRLF` in
110 the working directory.
111 - `native` (default): When normalization is done, force line-endings to be
112 the platform's native line ending.
113
114 One thing to remember is that when line-ending normalization is done on a file,
115 Git always normalizes line-endings to `LF` when writing to the index.
116
117 There are sources that seem to indicate that Git won't do line-ending
118 normalization when a file contains mixed line-endings. I think this logic
119 might be in text / binary detection heuristic but couldn't find it yet.
120
121 Sources:
122 - https://git-scm.com/docs/git-config#git-config-coreeol
123 - https://git-scm.com/docs/git-config#git-config-coreautocrlf
124 - https://git-scm.com/docs/gitattributes#_checking_out_and_checking_in
125 - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/
126 """
127
# Raw byte sequences for the two line-ending conventions handled by this
# module; CRLF is built from LF so the two stay in sync.
LF = b"\n"
CRLF = b"\r" + LF
130
131
def convert_crlf_to_lf(text_hunk):
    """Turn every CRLF sequence of a text hunk into a single LF

    :param text_hunk: A bytes string representing a text hunk
    :return: The text hunk with the same type, with each CRLF replaced by LF
    """
    normalized = text_hunk.replace(CRLF, LF)
    return normalized
139
140
def convert_lf_to_crlf(text_hunk):
    """Rewrite a text hunk so that every LF line ending becomes CRLF

    :param text_hunk: A bytes string representing a text hunk
    :return: The text hunk with the same type, with each LF replaced by CRLF
    """
    # TODO find a more efficient way of doing it
    # Existing CRLF sequences are collapsed to LF first so that the second
    # pass does not expand them into CR CR LF.
    return text_hunk.replace(CRLF, LF).replace(LF, CRLF)
150
151
def get_checkout_filter_autocrlf(core_autocrlf):
    """Select the checkout filter matching a core.autocrlf value

    :param core_autocrlf: The bytes configuration value of core.autocrlf.
        Valid values are: b'true', b'false' or b'input'.
    :return: Either None if no filter has to be applied or a function
        accepting a single argument, a binary text hunk
    """
    # Only b"true" converts on checkout; any other value means no filter.
    return convert_lf_to_crlf if core_autocrlf == b"true" else None
165
166
def get_checkin_filter_autocrlf(core_autocrlf):
    """Select the checkin filter matching a core.autocrlf value

    :param core_autocrlf: The bytes configuration value of core.autocrlf.
        Valid values are: b'true', b'false' or b'input'.
    :return: Either None if no filter has to be applied or a function
        accepting a single argument, a binary text hunk
    """
    if core_autocrlf not in (b"true", b"input"):
        return None

    # A checkin filter should never be `convert_lf_to_crlf`
    return convert_crlf_to_lf
109109 'hooks',
110110 'ignore',
111111 'index',
112 'line_ending',
112113 'lru_cache',
113114 'mailmap',
114115 'objects',
0 # -*- coding: utf-8 -*-
1 # test_line_ending.py -- Tests for the line ending functions
2 # encoding: utf-8
3 # Copyright (C) 2018-2019 Boris Feld <boris.feld@comet.ml>
4 #
5 # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6 # General Public License as published by the Free Software Foundation; version 2.0
7 # or (at your option) any later version. You can redistribute it and/or
8 # modify it under the terms of either of these two licenses.
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 #
16 # You should have received a copy of the licenses; if not, see
17 # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18 # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19 # License, Version 2.0.
20 #
21
22 """Tests for the line ending conversion."""
23
24 from dulwich.line_ending import (
25 convert_crlf_to_lf,
26 convert_lf_to_crlf,
27 get_checkin_filter_autocrlf,
28 get_checkout_filter_autocrlf,
29 )
30 from dulwich.tests import TestCase
31
32
class LineEndingConversion(TestCase):
    """Exercise both conversion helpers on plain, single-style and mixed input"""

    def test_convert_crlf_to_lf_no_op(self):
        # Input without any line ending must come back unchanged.
        converted = convert_crlf_to_lf(b"foobar")
        self.assertEqual(converted, b"foobar")

    def test_convert_crlf_to_lf(self):
        converted = convert_crlf_to_lf(b"line1\r\nline2")
        self.assertEqual(converted, b"line1\nline2")

    def test_convert_crlf_to_lf_mixed(self):
        # A bare LF next to a CRLF must be left as it is.
        converted = convert_crlf_to_lf(b"line1\r\n\nline2")
        self.assertEqual(converted, b"line1\n\nline2")

    def test_convert_lf_to_crlf_no_op(self):
        # Input without any line ending must come back unchanged.
        converted = convert_lf_to_crlf(b"foobar")
        self.assertEqual(converted, b"foobar")

    def test_convert_lf_to_crlf(self):
        converted = convert_lf_to_crlf(b"line1\nline2")
        self.assertEqual(converted, b"line1\r\nline2")

    def test_convert_lf_to_crlf_mixed(self):
        # An existing CRLF must not be expanded a second time.
        converted = convert_lf_to_crlf(b"line1\r\n\nline2")
        self.assertEqual(converted, b"line1\r\n\r\nline2")
61
62
class GetLineEndingAutocrlfFilters(TestCase):
    """Check which filter each core.autocrlf value selects"""

    def test_get_checkin_filter_autocrlf_default(self):
        self.assertEqual(get_checkin_filter_autocrlf(b"false"), None)

    def test_get_checkin_filter_autocrlf_true(self):
        self.assertEqual(
            get_checkin_filter_autocrlf(b"true"), convert_crlf_to_lf
        )

    def test_get_checkin_filter_autocrlf_input(self):
        self.assertEqual(
            get_checkin_filter_autocrlf(b"input"), convert_crlf_to_lf
        )

    def test_get_checkout_filter_autocrlf_default(self):
        self.assertEqual(get_checkout_filter_autocrlf(b"false"), None)

    def test_get_checkout_filter_autocrlf_true(self):
        self.assertEqual(
            get_checkout_filter_autocrlf(b"true"), convert_lf_to_crlf
        )

    def test_get_checkout_filter_autocrlf_input(self):
        # b"input" only affects checkin, so checkout gets no filter.
        self.assertEqual(get_checkout_filter_autocrlf(b"input"), None)