Skip to content

Commit a22d43a

Browse files
committed
Improve text extraction from HTML
1 parent 4e897b9 commit a22d43a

File tree

1 file changed

+72
-14
lines changed

1 file changed

+72
-14
lines changed

src/WinWebDiffLib/WebWindow.hpp

Lines changed: 72 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -539,38 +539,46 @@ class CWebWindow
539539
return hr;
540540
}
541541

542-
HRESULT SaveText(FILE *fp, const WValue& value)
542+
HRESULT SaveText(FILE *fp, const WValue& value, size_t& textLength)
543543
{
544544
const int nodeType = value[L"nodeType"].GetInt();
545-
const auto* nodeName = value[L"nodeName"].GetString();
546-
const bool fInline = IsInlineElement(nodeName);
547545

548-
if (value[L"nodeType"].GetInt() == 3 /* #text */)
546+
if (nodeType == 3 /* TEXT_NODE */)
549547
{
550-
if (fwprintf(fp, L"%s", value[L"nodeValue"].GetString()) < 0)
548+
std::wstring text = value[L"nodeValue"].GetString();
549+
text =
550+
((text.length() > 0 && iswspace(text.front())) ? L" " : L"") +
551+
trim_ws(text) +
552+
((text.length() > 0 && iswspace(text.back())) ? L" " : L"");
553+
if (fwprintf(fp, L"%s", text.c_str()) < 0)
551554
return HRESULT_FROM_WIN32(GetLastError());
555+
textLength += text.length();
552556
}
553557
if (value.HasMember(L"children") && value[L"children"].IsArray())
554558
{
559+
const auto* nodeName = value[L"nodeName"].GetString();
560+
const bool fInline = IsInlineElement(nodeName);
555561
if (wcscmp(nodeName, L"SCRIPT") != 0 && wcscmp(nodeName, L"STYLE") != 0)
556562
{
557-
int textCount = 0;
563+
if (nodeType == 1)
564+
{
565+
if ((!fInline && textLength > 0) || wcscmp(nodeName, L"BR") == 0 || wcscmp(nodeName, L"HR") == 0)
566+
{
567+
fwprintf(fp, L"\n");
568+
textLength = 0;
569+
}
570+
}
558571
for (const auto& child : value[L"children"].GetArray())
559572
{
560-
int childNodeType = child[L"nodeType"].GetInt();
561-
if (childNodeType == 3)
562-
textCount++;
563-
HRESULT hr = SaveText(fp, child);
573+
HRESULT hr = SaveText(fp, child, textLength);
564574
if (FAILED(hr))
565575
return hr;
566576
}
567-
if ((!fInline && textCount > 0) || wcscmp(nodeName, L"BR") == 0 || wcscmp(nodeName, L"HR") == 0)
568-
fwprintf(fp, L"\n");
569577
}
570578
}
571579
if (value.HasMember(L"contentDocument"))
572580
{
573-
HRESULT hr = SaveText(fp, value[L"contentDocument"]);
581+
HRESULT hr = SaveText(fp, value[L"contentDocument"], textLength);
574582
if (FAILED(hr))
575583
return hr;
576584
}
@@ -593,7 +601,8 @@ class CWebWindow
593601
document.Parse(returnObjectAsJson);
594602
wil::unique_file fp;
595603
_wfopen_s(&fp, filename.c_str(), L"at,ccs=UTF-8");
596-
hr = SaveText(fp.get(), document[L"root"]);
604+
size_t textLength = 0;
605+
hr = SaveText(fp.get(), document[L"root"], textLength);
597606
}
598607
if (callback2)
599608
callback2->Invoke({ hr, nullptr });
@@ -1111,40 +1120,89 @@ class CWebWindow
11111120
L"A",
11121121
L"ABBR",
11131122
L"ACRONYM",
1123+
L"AUDIO",
11141124
L"B",
1125+
L"BDI",
11151126
L"BDO",
11161127
L"BIG",
11171128
L"BR",
11181129
L"BUTTON",
1130+
L"CANVAS",
11191131
L"CITE",
11201132
L"CODE",
1133+
L"DATA",
1134+
L"DATALIST",
1135+
L"DEL",
11211136
L"DFN",
11221137
L"EM",
1138+
L"EMBED",
11231139
L"I",
1140+
L"IFRAME",
11241141
L"IMG",
11251142
L"INPUT",
1143+
L"INS",
11261144
L"KBD",
11271145
L"LABEL",
11281146
L"MAP",
1147+
L"MARK",
1148+
L"METER",
1149+
L"NOSCRIPT",
11291150
L"OBJECT",
1151+
L"OUTPUT",
1152+
L"PICTURE",
1153+
L"PROGRESS",
11301154
L"Q",
1155+
L"RUBY",
1156+
L"S",
11311157
L"SAMP",
11321158
L"SCRIPT",
11331159
L"SELECT",
1160+
L"SLOT",
11341161
L"SMALL",
11351162
L"SPAN",
11361163
L"STRONG",
11371164
L"SUB",
11381165
L"SUP",
1166+
L"SVG",
1167+
L"TEMPLATE",
11391168
L"TEXTAREA",
1169+
L"TIME",
11401170
L"TT",
1171+
L"U",
11411172
L"VAR",
1173+
L"VIDEO",
1174+
L"WBR",
11421175
};
11431176
return bsearch(&name, inlineElements,
11441177
sizeof(inlineElements) / sizeof(inlineElements[0]),
11451178
sizeof(inlineElements[0]), cmp);
11461179
}
11471180

1181+
/**
 * @brief Return a copy of @p str with leading and trailing whitespace removed.
 *
 * A character is trimmed only if its code is below 0x100 and iswspace()
 * reports it as whitespace. Characters at or above 0x100 are never trimmed,
 * even when iswspace() would classify them as whitespace.
 *
 * The same wide-character predicate is used for both ends of the string so
 * that leading and trailing whitespace are classified identically.
 *
 * @param str Text to trim; may be empty.
 * @return The trimmed text; empty if @p str is empty or consists solely of
 *         trimmable whitespace.
 */
static std::wstring trim_ws(const std::wstring& str)
{
	// Cast to an unsigned type first so that a negative wchar_t (on platforms
	// where wchar_t is signed) fails the range check instead of reaching the
	// classification function.
	const auto isTrimmable = [](wchar_t ch)
	{
		return static_cast<unsigned long>(ch) < 0x100 && iswspace(ch) != 0;
	};

	const size_t length = str.length();

	// Index of the first character to keep.
	size_t first = 0;
	while (first < length && isTrimmable(str[first]))
		++first;

	// One past the last character to keep; never moves before `first`.
	size_t last = length;
	while (last > first && isTrimmable(str[last - 1]))
		--last;

	return str.substr(first, last - first);
}
1205+
11481206
static std::wstring Escape(const std::wstring& text)
11491207
{
11501208
std::wstring result;

0 commit comments

Comments
 (0)