@@ -64,7 +64,7 @@ type cssClassCollectorWriter struct {
64
64
buff bytes.Buffer
65
65
66
66
isCollecting bool
67
- dropValue bool
67
+ inPreTag string
68
68
69
69
inQuote bool
70
70
quoteValue byte
@@ -90,56 +90,70 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
90
90
b := p [i ]
91
91
w .toggleIfQuote (b )
92
92
if ! w .inQuote && b == '>' {
93
- w .endCollecting (false )
93
+ w .endCollecting ()
94
94
break
95
95
}
96
96
w .buff .WriteByte (b )
97
97
}
98
98
99
99
if ! w .isCollecting {
100
- if w .dropValue {
101
- w .buff .Reset ()
102
- } else {
103
- // First check if we have processed this element before.
104
- w .collector .mu .RLock ()
105
-
106
- // See https://github.com/dominikh/go-tools/issues/723
107
- //lint:ignore S1030 This construct avoids memory allocation for the string.
108
- seen := w .collector .elementSet [string (w .buff .Bytes ())]
109
- w .collector .mu .RUnlock ()
110
- if seen {
111
- w .buff .Reset ()
112
- continue
100
+ if w .inPreTag != "" {
101
+ s := w .buff .String ()
102
+ if tagName , isEnd := w .parseEndTag (s ); isEnd && w .inPreTag == tagName {
103
+ w .inPreTag = ""
113
104
}
105
+ w .buff .Reset ()
106
+ continue
107
+ }
114
108
115
- s := w .buff .String ()
109
+ // First check if we have processed this element before.
110
+ w .collector .mu .RLock ()
116
111
112
+ // See https://github.com/dominikh/go-tools/issues/723
113
+ //lint:ignore S1030 This construct avoids memory allocation for the string.
114
+ seen := w .collector .elementSet [string (w .buff .Bytes ())]
115
+ w .collector .mu .RUnlock ()
116
+ if seen {
117
117
w .buff .Reset ()
118
+ continue
119
+ }
118
120
119
- if strings .HasPrefix (s , "</" ) {
120
- continue
121
- }
121
+ s := w .buff .String ()
122
122
123
- key := s
123
+ w . buff . Reset ()
124
124
125
- s , tagName := w . insertStandinHTMLElement ( s )
126
- el := parseHTMLElement ( s )
127
- el . Tag = tagName
125
+ if strings . HasPrefix ( s , "</" ) {
126
+ continue
127
+ }
128
128
129
- w .collector .mu .Lock ()
130
- w .collector .elementSet [key ] = true
131
- if el .Tag != "" {
132
- w .collector .elements = append (w .collector .elements , el )
133
- }
134
- w .collector .mu .Unlock ()
129
+ key := s
130
+
131
+ s , tagName := w .insertStandinHTMLElement (s )
132
+ el := parseHTMLElement (s )
133
+ el .Tag = tagName
134
+ if w .isPreFormatted (tagName ) {
135
+ w .inPreTag = tagName
135
136
}
137
+
138
+ w .collector .mu .Lock ()
139
+ w .collector .elementSet [key ] = true
140
+ if el .Tag != "" {
141
+ w .collector .elements = append (w .collector .elements , el )
142
+ }
143
+ w .collector .mu .Unlock ()
144
+
136
145
}
137
146
}
138
147
}
139
148
140
149
return
141
150
}
142
151
152
+ // No need to look inside these for HTML elements.
153
+ func (c * cssClassCollectorWriter ) isPreFormatted (s string ) bool {
154
+ return s == "pre" || s == "textarea" || s == "script"
155
+ }
156
+
143
157
// The net/html parser does not handle single table elements as input, e.g. tbody.
144
158
// We only care about the element/class/ids, so just store away the original tag name
145
159
// and pretend it's a <div>.
@@ -154,15 +168,24 @@ func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, s
154
168
return newv , strings .ToLower (tag )
155
169
}
156
170
157
- func (c * cssClassCollectorWriter ) endCollecting (drop bool ) {
171
+ func (c * cssClassCollectorWriter ) parseEndTag (s string ) (string , bool ) {
172
+ if ! strings .HasPrefix (s , "</" ) {
173
+ return "" , false
174
+ }
175
+ s = strings .TrimPrefix (s , "</" )
176
+ s = strings .TrimSuffix (s , ">" )
177
+ return strings .ToLower (strings .TrimSpace (s )), true
178
+ }
179
+
180
+ func (c * cssClassCollectorWriter ) endCollecting () {
158
181
c .isCollecting = false
159
182
c .inQuote = false
160
- c . dropValue = drop
183
+
161
184
}
162
185
163
186
// startCollecting switches the writer into collecting mode.
func (c *cssClassCollectorWriter) startCollecting() {
	c.isCollecting = true
}
167
190
168
191
func (c * cssClassCollectorWriter ) toggleIfQuote (b byte ) {
0 commit comments