v1.6.0: Language Filter
This commit is contained in:
2
dist/bookmarklet-payload.js
vendored
2
dist/bookmarklet-payload.js
vendored
File diff suppressed because one or more lines are too long
2
dist/bookmarklet.txt
vendored
2
dist/bookmarklet.txt
vendored
File diff suppressed because one or more lines are too long
2
dist/install.html
vendored
2
dist/install.html
vendored
File diff suppressed because one or more lines are too long
125
dist/kantine-standalone.html
vendored
125
dist/kantine-standalone.html
vendored
@@ -4342,13 +4342,32 @@ body {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// === Language Filter (FR-100) ===
|
// === Language Filter (FR-100) ===
|
||||||
// DE keywords for fallback language detection
|
// DE stems for fallback language detection
|
||||||
const DE_KEYWORDS = ['mit', 'und', 'oder', 'vom', 'dazu', 'auf', 'nach', 'ein', 'eine', 'der', 'die', 'das', 'aus', 'in', 'an', 'für',
|
const DE_STEMS = [
|
||||||
'suppe', 'salat', 'gemüse', 'reis', 'nudeln', 'kartoffel', 'fleisch', 'soße', 'sauce', 'brot', 'joghurt',
|
'mit', 'und', 'oder', 'für', 'vom', 'zum', 'zur', 'gebraten', 'kartoffel', 'gemüse', 'suppe',
|
||||||
'gebraten', 'gekocht', 'gegrillt', 'überbacken', 'gefüllt', 'frisch', 'hausgemacht'];
|
'kuchen', 'schwein', 'rind', 'hähnchen', 'huhn', 'fisch', 'nudel', 'soße', 'sosse', 'wurst',
|
||||||
const EN_KEYWORDS = ['with', 'and', 'or', 'from', 'served', 'on', 'in', 'a', 'the', 'of', 'for',
|
'kürbis', 'braten', 'sahne', 'apfel', 'käse', 'fleisch', 'pilz', 'kirsch', 'joghurt', 'spätzle',
|
||||||
'soup', 'salad', 'vegetables', 'rice', 'pasta', 'potato', 'meat', 'sauce', 'bread', 'yogurt',
|
'knödel', 'kraut', 'schnitzel', 'püree', 'rahm', 'erdbeer', 'schoko', 'vanille', 'tomate',
|
||||||
'fried', 'cooked', 'grilled', 'baked', 'stuffed', 'fresh', 'homemade'];
|
'gurke', 'salat', 'zwiebel', 'paprika', 'reis', 'bohne', 'erbse', 'karotte', 'möhre', 'lauch',
|
||||||
|
'knoblauch', 'chili', 'gewürz', 'kräuter', 'pfeffer', 'salz', 'butter', 'milch', 'eier',
|
||||||
|
'pfanne', 'auflauf', 'gratin', 'ragout', 'gulasch', 'eintopf', 'filet', 'steak', 'brust',
|
||||||
|
'salami', 'schinken', 'speck', 'brokkoli', 'blumenkohl', 'zucchini', 'aubergine',
|
||||||
|
'spinat', 'spargel', 'olive', 'mandel', 'nuss', 'honig', 'senf', 'essig', 'öl', 'brot',
|
||||||
|
'brötchen', 'pfannkuchen', 'eis', 'torte', 'dessert', 'kompott', 'obst', 'frucht', 'beere',
|
||||||
|
'plunder', 'dip', 'tofu', 'jambalaya'
|
||||||
|
];
|
||||||
|
const EN_STEMS = [
|
||||||
|
'with', 'and', 'or', 'for', 'from', 'to', 'fried', 'potato', 'vegetable', 'soup', 'cake',
|
||||||
|
'pork', 'beef', 'chicken', 'fish', 'noodle', 'sauce', 'sausage', 'pumpkin', 'roast',
|
||||||
|
'cream', 'apple', 'cheese', 'meat', 'mushroom', 'cherry', 'yogurt', 'wedge', 'sweet',
|
||||||
|
'sour', 'dumpling', 'cabbage', 'mash', 'strawberr', 'choco', 'vanilla', 'tomat', 'cucumber',
|
||||||
|
'salad', 'onion', 'pepper', 'rice', 'bean', 'pea', 'carrot', 'leek', 'garlic', 'chili',
|
||||||
|
'spice', 'herb', 'salt', 'butter', 'milk', 'egg', 'pan', 'casserole', 'gratin', 'ragout',
|
||||||
|
'goulash', 'stew', 'filet', 'steak', 'breast', 'salami', 'ham', 'bacon', 'broccoli',
|
||||||
|
'cauliflower', 'zucchini', 'eggplant', 'spinach', 'asparagus', 'olive', 'almond', 'nut',
|
||||||
|
'honey', 'mustard', 'vinegar', 'oil', 'bread', 'bun', 'pancake', 'ice', 'tart', 'dessert',
|
||||||
|
'compote', 'fruit', 'berry', 'dip', 'danish', 'tofu', 'jambalaya'
|
||||||
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits bilingual menu text into DE and EN parts.
|
* Splits bilingual menu text into DE and EN parts.
|
||||||
@@ -4363,18 +4382,86 @@ body {
|
|||||||
const raw = text;
|
const raw = text;
|
||||||
const formattedRaw = '• ' + text.replace(/\(([A-Z ]+)\)\s*(?=\S)/g, '($1)\n• ');
|
const formattedRaw = '• ' + text.replace(/\(([A-Z ]+)\)\s*(?=\S)/g, '($1)\n• ');
|
||||||
|
|
||||||
|
/**
 * Scores how German and how English a list of words looks.
 *
 * Every word is lower-cased and reduced to the characters a-z, ä, ö, ü, ß;
 * words that are empty afterwards are ignored. Per language, the longest
 * stem contained in the cleaned word contributes stemLength / wordLength
 * (1.0 for an exact match, less for a partial match). A word whose first
 * letter is upper-case additionally adds 0.5 to the German score.
 *
 * @param {string[]} wordArray - Tokens in their original casing.
 * @returns {{de: number, en: number}} Accumulated German / English scores.
 */
function scoreBlock(wordArray) {
    // Length of the longest stem contained in `cleaned` (0 when none matches).
    // An exact match needs no special case: the word contains itself and no
    // contained stem can be longer than the word.
    const longestStemLength = (stems, cleaned) => stems.reduce(
        (best, stem) => (stem.length > best && cleaned.includes(stem) ? stem.length : best),
        0
    );

    let de = 0;
    let en = 0;

    for (const word of wordArray) {
        const cleaned = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
        if (!cleaned) continue;

        de += longestStemLength(DE_STEMS, cleaned) / cleaned.length;
        en += longestStemLength(EN_STEMS, cleaned) / cleaned.length;

        // Capitalized-noun heuristic. Leading punctuation such as quotes or
        // brackets is skipped so that "(Suppe" still counts as capitalized.
        if (/^[^A-Za-zÄÖÜäöüß]*[A-ZÄÖÜ]/.test(word)) {
            de += 0.5;
        }
    }

    return { de, en };
}
|
||||||
|
|
||||||
|
/**
 * Finds the hidden boundary in a fragment of the form "<EN text> <DE text>".
 *
 * E.g. "Bratwurst with pumpkin Kirschjoghurt"
 *   => enPart: "Bratwurst with pumpkin", nextDe: "Kirschjoghurt"
 *
 * Every split position k is rated with
 *   (EN - DE score of the left words) + (DE - EN score of the right words),
 * plus a bonus of 2.0 when the right part starts with a capitalized word.
 * A position is only eligible when its right part scores more German than
 * English; without that evidence the fragment is returned unsplit, so a
 * purely English fragment is no longer cut in two.
 *
 * @param {string} fragment - Text that may contain English followed by German.
 * @returns {{enPart: string, nextDe: string}} The two parts; nextDe is ''
 *          when no split was made (enPart is then the unmodified fragment).
 */
function heuristicSplitEnDe(fragment) {
    const unsplit = { enPart: fragment, nextDe: '' };

    const words = fragment.trim().split(/\s+/);
    if (words.length < 2) return unsplit;

    let bestK = -1;
    let maxScore = -Infinity;

    for (let k = 1; k < words.length; k++) {
        const right = words.slice(k);
        const rightScore = scoreBlock(right);

        // Only split when the right-hand side actually looks German.
        if (rightScore.de <= rightScore.en) continue;

        const leftScore = scoreBlock(words.slice(0, k));

        // Left should be EN, right should be DE.
        const score = (leftScore.en - leftScore.de) + (rightScore.de - rightScore.en);

        // Bonus when the right (DE) part starts with a capitalized word.
        // Leading punctuation is skipped.
        const capitalBonus = /^[^A-Za-zÄÖÜäöüß]*[A-ZÄÖÜ]/.test(right[0]) ? 2.0 : 0;

        const finalScore = score + capitalBonus;
        if (finalScore > maxScore) {
            maxScore = finalScore;
            bestK = k;
        }
    }

    if (bestK === -1) return unsplit;

    return {
        enPart: words.slice(0, bestK).join(' '),
        nextDe: words.slice(bestK).join(' ')
    };
}
|
||||||
|
|
||||||
// Check if text contains the bilingual separator ' / '
|
// Check if text contains the bilingual separator ' / '
|
||||||
if (!text.includes(' / ')) {
|
if (!text.includes(' / ')) {
|
||||||
// Fallback: detect language via keyword scoring
|
// Fallback: detect language via keyword scoring
|
||||||
const words = text.toLowerCase().split(/\s+/);
|
const words = text.toLowerCase().split(/\s+/);
|
||||||
let deScore = 0, enScore = 0;
|
const score = scoreBlock(words);
|
||||||
words.forEach(w => {
|
|
||||||
const clean = w.replace(/[^a-zäöüß]/g, '');
|
|
||||||
if (DE_KEYWORDS.includes(clean)) deScore++;
|
|
||||||
if (EN_KEYWORDS.includes(clean)) enScore++;
|
|
||||||
});
|
|
||||||
// No split possible – return full text for detected language, empty for other
|
// No split possible – return full text for detected language, empty for other
|
||||||
if (enScore > deScore) {
|
if (score.en > score.de) {
|
||||||
return { de: '', en: formattedRaw, raw: formattedRaw };
|
return { de: '', en: formattedRaw, raw: formattedRaw };
|
||||||
}
|
}
|
||||||
return { de: formattedRaw, en: '', raw: formattedRaw };
|
return { de: formattedRaw, en: '', raw: formattedRaw };
|
||||||
@@ -4419,8 +4506,14 @@ body {
|
|||||||
deParts.push(nextDe);
|
deParts.push(nextDe);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// No allergen code – this is the last EN part
|
// No allergen code found!
|
||||||
enParts.push(fragment);
|
// If it's not the last part (or even if it is, but we highly suspect merged languages),
|
||||||
|
// we use the heuristic to find the hidden split-point.
|
||||||
|
const split = heuristicSplitEnDe(fragment);
|
||||||
|
enParts.push(split.enPart);
|
||||||
|
if (split.nextDe) {
|
||||||
|
deParts.push(split.nextDe);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
125
kantine.js
125
kantine.js
@@ -2359,13 +2359,32 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
// === Language Filter (FR-100) ===
|
// === Language Filter (FR-100) ===
|
||||||
// DE keywords for fallback language detection
|
// DE stems for fallback language detection
|
||||||
const DE_KEYWORDS = ['mit', 'und', 'oder', 'vom', 'dazu', 'auf', 'nach', 'ein', 'eine', 'der', 'die', 'das', 'aus', 'in', 'an', 'für',
|
const DE_STEMS = [
|
||||||
'suppe', 'salat', 'gemüse', 'reis', 'nudeln', 'kartoffel', 'fleisch', 'soße', 'sauce', 'brot', 'joghurt',
|
'mit', 'und', 'oder', 'für', 'vom', 'zum', 'zur', 'gebraten', 'kartoffel', 'gemüse', 'suppe',
|
||||||
'gebraten', 'gekocht', 'gegrillt', 'überbacken', 'gefüllt', 'frisch', 'hausgemacht'];
|
'kuchen', 'schwein', 'rind', 'hähnchen', 'huhn', 'fisch', 'nudel', 'soße', 'sosse', 'wurst',
|
||||||
const EN_KEYWORDS = ['with', 'and', 'or', 'from', 'served', 'on', 'in', 'a', 'the', 'of', 'for',
|
'kürbis', 'braten', 'sahne', 'apfel', 'käse', 'fleisch', 'pilz', 'kirsch', 'joghurt', 'spätzle',
|
||||||
'soup', 'salad', 'vegetables', 'rice', 'pasta', 'potato', 'meat', 'sauce', 'bread', 'yogurt',
|
'knödel', 'kraut', 'schnitzel', 'püree', 'rahm', 'erdbeer', 'schoko', 'vanille', 'tomate',
|
||||||
'fried', 'cooked', 'grilled', 'baked', 'stuffed', 'fresh', 'homemade'];
|
'gurke', 'salat', 'zwiebel', 'paprika', 'reis', 'bohne', 'erbse', 'karotte', 'möhre', 'lauch',
|
||||||
|
'knoblauch', 'chili', 'gewürz', 'kräuter', 'pfeffer', 'salz', 'butter', 'milch', 'eier',
|
||||||
|
'pfanne', 'auflauf', 'gratin', 'ragout', 'gulasch', 'eintopf', 'filet', 'steak', 'brust',
|
||||||
|
'salami', 'schinken', 'speck', 'brokkoli', 'blumenkohl', 'zucchini', 'aubergine',
|
||||||
|
'spinat', 'spargel', 'olive', 'mandel', 'nuss', 'honig', 'senf', 'essig', 'öl', 'brot',
|
||||||
|
'brötchen', 'pfannkuchen', 'eis', 'torte', 'dessert', 'kompott', 'obst', 'frucht', 'beere',
|
||||||
|
'plunder', 'dip', 'tofu', 'jambalaya'
|
||||||
|
];
|
||||||
|
const EN_STEMS = [
|
||||||
|
'with', 'and', 'or', 'for', 'from', 'to', 'fried', 'potato', 'vegetable', 'soup', 'cake',
|
||||||
|
'pork', 'beef', 'chicken', 'fish', 'noodle', 'sauce', 'sausage', 'pumpkin', 'roast',
|
||||||
|
'cream', 'apple', 'cheese', 'meat', 'mushroom', 'cherry', 'yogurt', 'wedge', 'sweet',
|
||||||
|
'sour', 'dumpling', 'cabbage', 'mash', 'strawberr', 'choco', 'vanilla', 'tomat', 'cucumber',
|
||||||
|
'salad', 'onion', 'pepper', 'rice', 'bean', 'pea', 'carrot', 'leek', 'garlic', 'chili',
|
||||||
|
'spice', 'herb', 'salt', 'butter', 'milk', 'egg', 'pan', 'casserole', 'gratin', 'ragout',
|
||||||
|
'goulash', 'stew', 'filet', 'steak', 'breast', 'salami', 'ham', 'bacon', 'broccoli',
|
||||||
|
'cauliflower', 'zucchini', 'eggplant', 'spinach', 'asparagus', 'olive', 'almond', 'nut',
|
||||||
|
'honey', 'mustard', 'vinegar', 'oil', 'bread', 'bun', 'pancake', 'ice', 'tart', 'dessert',
|
||||||
|
'compote', 'fruit', 'berry', 'dip', 'danish', 'tofu', 'jambalaya'
|
||||||
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits bilingual menu text into DE and EN parts.
|
* Splits bilingual menu text into DE and EN parts.
|
||||||
@@ -2380,18 +2399,86 @@
|
|||||||
const raw = text;
|
const raw = text;
|
||||||
const formattedRaw = '• ' + text.replace(/\(([A-Z ]+)\)\s*(?=\S)/g, '($1)\n• ');
|
const formattedRaw = '• ' + text.replace(/\(([A-Z ]+)\)\s*(?=\S)/g, '($1)\n• ');
|
||||||
|
|
||||||
|
/**
 * Scores how German and how English a list of words looks.
 *
 * Every word is lower-cased and reduced to the characters a-z, ä, ö, ü, ß;
 * words that are empty afterwards are ignored. Per language, the longest
 * stem contained in the cleaned word contributes stemLength / wordLength
 * (1.0 for an exact match, less for a partial match). A word whose first
 * letter is upper-case additionally adds 0.5 to the German score.
 *
 * @param {string[]} wordArray - Tokens in their original casing.
 * @returns {{de: number, en: number}} Accumulated German / English scores.
 */
function scoreBlock(wordArray) {
    // Length of the longest stem contained in `cleaned` (0 when none matches).
    // An exact match needs no special case: the word contains itself and no
    // contained stem can be longer than the word.
    const longestStemLength = (stems, cleaned) => stems.reduce(
        (best, stem) => (stem.length > best && cleaned.includes(stem) ? stem.length : best),
        0
    );

    let de = 0;
    let en = 0;

    for (const word of wordArray) {
        const cleaned = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
        if (!cleaned) continue;

        de += longestStemLength(DE_STEMS, cleaned) / cleaned.length;
        en += longestStemLength(EN_STEMS, cleaned) / cleaned.length;

        // Capitalized-noun heuristic. Leading punctuation such as quotes or
        // brackets is skipped so that "(Suppe" still counts as capitalized.
        if (/^[^A-Za-zÄÖÜäöüß]*[A-ZÄÖÜ]/.test(word)) {
            de += 0.5;
        }
    }

    return { de, en };
}
|
||||||
|
|
||||||
|
/**
 * Finds the hidden boundary in a fragment of the form "<EN text> <DE text>".
 *
 * E.g. "Bratwurst with pumpkin Kirschjoghurt"
 *   => enPart: "Bratwurst with pumpkin", nextDe: "Kirschjoghurt"
 *
 * Every split position k is rated with
 *   (EN - DE score of the left words) + (DE - EN score of the right words),
 * plus a bonus of 2.0 when the right part starts with a capitalized word.
 * A position is only eligible when its right part scores more German than
 * English; without that evidence the fragment is returned unsplit, so a
 * purely English fragment is no longer cut in two.
 *
 * @param {string} fragment - Text that may contain English followed by German.
 * @returns {{enPart: string, nextDe: string}} The two parts; nextDe is ''
 *          when no split was made (enPart is then the unmodified fragment).
 */
function heuristicSplitEnDe(fragment) {
    const unsplit = { enPart: fragment, nextDe: '' };

    const words = fragment.trim().split(/\s+/);
    if (words.length < 2) return unsplit;

    let bestK = -1;
    let maxScore = -Infinity;

    for (let k = 1; k < words.length; k++) {
        const right = words.slice(k);
        const rightScore = scoreBlock(right);

        // Only split when the right-hand side actually looks German.
        if (rightScore.de <= rightScore.en) continue;

        const leftScore = scoreBlock(words.slice(0, k));

        // Left should be EN, right should be DE.
        const score = (leftScore.en - leftScore.de) + (rightScore.de - rightScore.en);

        // Bonus when the right (DE) part starts with a capitalized word.
        // Leading punctuation is skipped.
        const capitalBonus = /^[^A-Za-zÄÖÜäöüß]*[A-ZÄÖÜ]/.test(right[0]) ? 2.0 : 0;

        const finalScore = score + capitalBonus;
        if (finalScore > maxScore) {
            maxScore = finalScore;
            bestK = k;
        }
    }

    if (bestK === -1) return unsplit;

    return {
        enPart: words.slice(0, bestK).join(' '),
        nextDe: words.slice(bestK).join(' ')
    };
}
|
||||||
|
|
||||||
// Check if text contains the bilingual separator ' / '
|
// Check if text contains the bilingual separator ' / '
|
||||||
if (!text.includes(' / ')) {
|
if (!text.includes(' / ')) {
|
||||||
// Fallback: detect language via keyword scoring
|
// Fallback: detect language via keyword scoring
|
||||||
const words = text.toLowerCase().split(/\s+/);
|
const words = text.toLowerCase().split(/\s+/);
|
||||||
let deScore = 0, enScore = 0;
|
const score = scoreBlock(words);
|
||||||
words.forEach(w => {
|
|
||||||
const clean = w.replace(/[^a-zäöüß]/g, '');
|
|
||||||
if (DE_KEYWORDS.includes(clean)) deScore++;
|
|
||||||
if (EN_KEYWORDS.includes(clean)) enScore++;
|
|
||||||
});
|
|
||||||
// No split possible – return full text for detected language, empty for other
|
// No split possible – return full text for detected language, empty for other
|
||||||
if (enScore > deScore) {
|
if (score.en > score.de) {
|
||||||
return { de: '', en: formattedRaw, raw: formattedRaw };
|
return { de: '', en: formattedRaw, raw: formattedRaw };
|
||||||
}
|
}
|
||||||
return { de: formattedRaw, en: '', raw: formattedRaw };
|
return { de: formattedRaw, en: '', raw: formattedRaw };
|
||||||
@@ -2436,8 +2523,14 @@
|
|||||||
deParts.push(nextDe);
|
deParts.push(nextDe);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// No allergen code – this is the last EN part
|
// No allergen code found!
|
||||||
enParts.push(fragment);
|
// If it's not the last part (or even if it is, but we highly suspect merged languages),
|
||||||
|
// we use the heuristic to find the hidden split-point.
|
||||||
|
const split = heuristicSplitEnDe(fragment);
|
||||||
|
enParts.push(split.enPart);
|
||||||
|
if (split.nextDe) {
|
||||||
|
deParts.push(split.nextDe);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -62,6 +62,7 @@ const sandbox = {
|
|||||||
}
|
}
|
||||||
return createMockElement('query-result');
|
return createMockElement('query-result');
|
||||||
},
|
},
|
||||||
|
querySelectorAll: () => [createMockElement()],
|
||||||
getElementById: (id) => createMockElement(id),
|
getElementById: (id) => createMockElement(id),
|
||||||
documentElement: {
|
documentElement: {
|
||||||
setAttribute: () => { },
|
setAttribute: () => { },
|
||||||
|
|||||||
134
test_split.js
Executable file
134
test_split.js
Executable file
@@ -0,0 +1,134 @@
|
|||||||
|
// German stems used for language scoring. Mirrors the DE_STEMS list in
// kantine.js, including 'tofu' and 'jambalaya', which the sample sentence
// at the bottom of this script relies on.
const DE_STEMS = [
    'mit', 'und', 'oder', 'für', 'vom', 'zum', 'zur', 'gebraten', 'kartoffel', 'gemüse', 'suppe',
    'kuchen', 'schwein', 'rind', 'hähnchen', 'huhn', 'fisch', 'nudel', 'soße', 'sosse', 'wurst',
    'kürbis', 'braten', 'sahne', 'apfel', 'käse', 'fleisch', 'pilz', 'kirsch', 'joghurt', 'spätzle',
    'knödel', 'kraut', 'schnitzel', 'püree', 'rahm', 'erdbeer', 'schoko', 'vanille', 'tomate',
    'gurke', 'salat', 'zwiebel', 'paprika', 'reis', 'bohne', 'erbse', 'karotte', 'möhre', 'lauch',
    'knoblauch', 'chili', 'gewürz', 'kräuter', 'pfeffer', 'salz', 'butter', 'milch', 'eier',
    'pfanne', 'auflauf', 'gratin', 'ragout', 'gulasch', 'eintopf', 'filet', 'steak', 'brust',
    'salami', 'schinken', 'speck', 'brokkoli', 'blumenkohl', 'zucchini', 'aubergine',
    'spinat', 'spargel', 'olive', 'mandel', 'nuss', 'honig', 'senf', 'essig', 'öl', 'brot',
    'brötchen', 'pfannkuchen', 'eis', 'torte', 'dessert', 'kompott', 'obst', 'frucht', 'beere',
    'plunder', 'dip', 'tofu', 'jambalaya'
];

// English stems used for language scoring. Mirrors the EN_STEMS list in
// kantine.js. Some entries are deliberately truncated ('strawberr', 'tomat')
// so that they match several word forms by substring.
const EN_STEMS = [
    'with', 'and', 'or', 'for', 'from', 'to', 'fried', 'potato', 'vegetable', 'soup', 'cake',
    'pork', 'beef', 'chicken', 'fish', 'noodle', 'sauce', 'sausage', 'pumpkin', 'roast',
    'cream', 'apple', 'cheese', 'meat', 'mushroom', 'cherry', 'yogurt', 'wedge', 'sweet',
    'sour', 'dumpling', 'cabbage', 'mash', 'strawberr', 'choco', 'vanilla', 'tomat', 'cucumber',
    'salad', 'onion', 'pepper', 'rice', 'bean', 'pea', 'carrot', 'leek', 'garlic', 'chili',
    'spice', 'herb', 'salt', 'butter', 'milk', 'egg', 'pan', 'casserole', 'gratin', 'ragout',
    'goulash', 'stew', 'filet', 'steak', 'breast', 'salami', 'ham', 'bacon', 'broccoli',
    'cauliflower', 'zucchini', 'eggplant', 'spinach', 'asparagus', 'olive', 'almond', 'nut',
    'honey', 'mustard', 'vinegar', 'oil', 'bread', 'bun', 'pancake', 'ice', 'tart', 'dessert',
    'compote', 'fruit', 'berry', 'dip', 'danish', 'tofu', 'jambalaya'
];
|
||||||
|
|
||||||
|
/**
 * Splits bilingual menu text of the form "DE / EN(CODES) DE / EN ..." into
 * its German and English parts.
 *
 * Scoring and split heuristics follow the implementation in kantine.js:
 * per word only the best stem match counts (weighted stem length / word
 * length), capitalized words lean German, and a fragment is only split when
 * its right-hand side scores more German than English.
 *
 * @param {string} text - Raw menu text; ' / ' separates DE from EN.
 * @returns {{de: string, en: string, raw: string}} Bullet-formatted German
 *          text, English text and the bullet-formatted original.
 */
function splitLanguage(text) {
    if (!text) return { de: '', en: '', raw: '' };

    // One bullet per dish: break after every allergen code that is followed by more text.
    const formattedRaw = '• ' + text.replace(/\(([A-Z ]+)\)\s*(?=\S)/g, '($1)\n• ');

    // Matches a first letter that is upper-case, skipping leading punctuation.
    const startsCapitalized = /^[^A-Za-zÄÖÜäöüß]*[A-ZÄÖÜ]/;

    // Length of the longest stem contained in `cleaned` (0 when none matches).
    const longestStemLength = (stems, cleaned) => stems.reduce(
        (best, stem) => (stem.length > best && cleaned.includes(stem) ? stem.length : best),
        0
    );

    // Computes the DE/EN score for a list of words.
    function scoreBlock(wordArray) {
        let de = 0;
        let en = 0;
        for (const word of wordArray) {
            const cleaned = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
            if (!cleaned) continue;
            de += longestStemLength(DE_STEMS, cleaned) / cleaned.length;
            en += longestStemLength(EN_STEMS, cleaned) / cleaned.length;
            if (startsCapitalized.test(word)) de += 0.5;
        }
        return { de, en };
    }

    // Finds the hidden EN|DE boundary inside a fragment, if there is one.
    function heuristicSplitEnDe(fragment) {
        const unsplit = { enPart: fragment, nextDe: '' };
        const words = fragment.trim().split(/\s+/);
        if (words.length < 2) return unsplit;

        let bestK = -1;
        let maxScore = -Infinity;

        for (let k = 1; k < words.length; k++) {
            const right = words.slice(k);
            const rightScore = scoreBlock(right);

            // Only split when the right-hand side actually looks German.
            if (rightScore.de <= rightScore.en) continue;

            const leftScore = scoreBlock(words.slice(0, k));
            const score = (leftScore.en - leftScore.de) + (rightScore.de - rightScore.en);
            const capitalBonus = startsCapitalized.test(right[0]) ? 2.0 : 0;

            if (score + capitalBonus > maxScore) {
                maxScore = score + capitalBonus;
                bestK = k;
            }
        }

        if (bestK === -1) return unsplit;
        return {
            enPart: words.slice(0, bestK).join(' '),
            nextDe: words.slice(bestK).join(' ')
        };
    }

    // No bilingual separator: classify the whole text as one language.
    if (!text.includes(' / ')) {
        const score = scoreBlock(text.toLowerCase().split(/\s+/));
        if (score.en > score.de) {
            return { de: '', en: formattedRaw, raw: formattedRaw };
        }
        return { de: formattedRaw, en: '', raw: formattedRaw };
    }

    const parts = text.split(' / ');
    if (parts.length > 4) {
        // Too many separators to pair up reliably; treat everything as German.
        return { de: formattedRaw, en: '', raw: formattedRaw };
    }

    // The text before the first separator is always German.
    const deParts = [parts[0].trim()];
    const enParts = [];

    const allergenRegex = /\(([A-Z ]+)\)\s*/;

    for (let i = 1; i < parts.length; i++) {
        const fragment = parts[i].trim();
        const match = fragment.match(allergenRegex);

        if (!match) {
            // No allergen code marks the boundary; fall back to the heuristic.
            const split = heuristicSplitEnDe(fragment);
            enParts.push(split.enPart);
            if (split.nextDe) {
                deParts.push(split.nextDe);
            }
            continue;
        }

        const allergenSuffix = '(' + match[1] + ')';
        const enPart = fragment.substring(0, match.index).trim();
        const nextDe = fragment.substring(match.index + match[0].length).trim();

        enParts.push(enPart + allergenSuffix);
        // The code belongs to the German dish as well (deParts is never empty here).
        deParts[deParts.length - 1] += allergenSuffix;

        if (nextDe) {
            deParts.push(nextDe);
        }
    }

    return {
        de: deParts.map(p => '• ' + p).join('\n'),
        en: enParts.map(p => '• ' + p).join('\n'),
        raw: formattedRaw
    };
}
|
||||||
|
|
||||||
|
// Manual smoke run: print the split result for a sample bilingual menu line.
const sampleMenuLine = "Hühnersuppe mit Reis / Chicken soup with rice Jambalaya mit Tofu und Joghurtdip / Jambalaya with tofu and yogurt dip Mini Plunder / Mini danishes(ACGHO)";
const sampleResult = splitLanguage(sampleMenuLine);
console.log(sampleResult);
|
||||||
Reference in New Issue
Block a user