From 071022dbdd40bc3315f2109d61a263da2b08ffee Mon Sep 17 00:00:00 2001 From: philipp-gecko <259382694+philipp-gecko@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:57:51 +0100 Subject: [PATCH] Add SI (structured register content) detail page scraping Extends the CLI to fetch the XJustiz XML document from the Handelsregister portal, parsing company details including address, legal form, share capital, managing directors, prokura holders, business purpose, and register entries. New --detail / -det flag fetches SI data for the first search result. Both text and JSON (-j) output modes include the enriched data. Co-Authored-By: Claude Opus 4.6 --- handelsregister.py | 383 +++++++++++++++++++++++++++++++++++++++- test_handelsregister.py | 83 ++++++++- 2 files changed, 455 insertions(+), 11 deletions(-) diff --git a/handelsregister.py b/handelsregister.py index 03ccc1a..137ce9b 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -12,6 +12,7 @@ import sys from bs4 import BeautifulSoup import urllib.parse +import xml.etree.ElementTree as ET # Dictionaries to map arguments to values schlagwortOptionen = { @@ -20,6 +21,24 @@ "exact": 3 } +# XJustiz namespace +NS = {'tns': 'http://www.xjustiz.de'} + +# Role code to label mapping (from xjustiz codeliste:gds.rollenbezeichnung) +ROLE_CODES = { + '086': 'Geschäftsführer(in)', + '087': 'Vorstand', + '285': 'Prokurist(in)', + '287': 'Rechtsträger(in)', + '288': 'Registergericht', + '215': 'Einreicher(in)', + '061': 'Gesellschafter(in)', + '062': 'Kommanditist(in)', + '063': 'Persönlich haftende(r) Gesellschafter(in)', + '089': 'Liquidator(in)', + '297': 'Inhaber(in)', +} + class HandelsRegister: def __init__(self, args): self.args = args @@ -49,7 +68,7 @@ def __init__(self, args): ), ( "Connection", "keep-alive" ), ] - + self.cachedir = pathlib.Path(tempfile.gettempdir()) / "handelsregister_cache" self.cachedir.mkdir(parents=True, exist_ok=True) @@ -96,8 +115,294 @@ def search_company(self): # TODO catch the situation if 
def fetch_company_detail(self, result_index=0):
    """Fetch and parse the SI (Strukturierter Registerinhalt) document of a search result.

    Must be called after search_company(), which stores the result page HTML
    on the instance. The SI link of the requested row carries its PrimeFaces
    submit parameters in its ``onclick`` attribute; they are injected as
    hidden controls into ``ergebnissForm`` and the form is submitted to
    download the XML document. The downloaded XML is also written to the
    cache file derived from the search keywords plus ``_SI``.

    Args:
        result_index: Zero-based row index of the search result whose SI
            document should be fetched.

    Returns:
        The dict produced by parse_si_detail(), or None if the row has no SI
        link or the link carries no submit parameters.

    Raises:
        RuntimeError: If no search result page has been stored yet, i.e.
            search_company() was not called first.
    """
    html = getattr(self, '_last_search_html', None)
    if html is None:
        # Fail with an actionable message instead of an opaque AttributeError.
        raise RuntimeError(
            'fetch_company_detail() requires a prior call to search_company()')

    soup = BeautifulSoup(html, 'html.parser')

    # Link ids embed the row index between colons, e.g.
    # ergebnissForm:selectedSuchErgebnisFormTable:{index}:...:fade_
    row_marker = ':{}:'.format(result_index)
    si_link = None
    for anchor in soup.find_all('a'):
        if row_marker not in anchor.get('id', ''):
            continue
        span = anchor.find('span')
        if span and span.text.strip() == 'SI':
            si_link = anchor
            break
    if si_link is None:
        return None

    # The onclick handler holds the PrimeFaces.addSubmitParam key/value pairs;
    # an empty or missing onclick simply yields no pairs.
    pairs = re.findall(r"'([^']+)':'([^']*)'", si_link.get('onclick', ''))
    if not pairs:
        return None

    # Select the ergebnissForm and inject the parameters as hidden controls.
    self.browser.select_form(name='ergebnissForm')
    for key, value in pairs:
        self.browser.form.new_control('hidden', key, {'value': value})
    self.browser.form.fixup()

    si_xml = self.browser.submit().read().decode('utf-8')

    # Cache the SI XML. The explicit encoding keeps umlauts intact on
    # platforms whose default text encoding is not UTF-8.
    cachename = self.companyname2cachename(self.args.schlagwoerter + '_SI')
    with open(cachename, 'w', encoding='utf-8') as f:
        f.write(si_xml)

    return parse_si_detail(si_xml)
# Matches an XML comment directly followed by a <code> element, capturing the
# comment text (group 1) and the code value (group 2). The tempered dot keeps
# the label from running past the end of its own comment.
# NOTE(review): pattern reconstructed from the docstring's description of
# "comment, then code element" -- confirm against a real SI document.
_CODE_COMMENT_RE = re.compile(
    r'<!--\s*((?:(?!-->).)*?)\s*-->\s*<code(?:\s[^>]*)?>\s*([^<]*?)\s*</code>',
    re.DOTALL,
)


def _build_comment_map(xml_str):
    """Build a map from coded element values to the XML comment preceding them.

    XJustiz documents put a human-readable label in an XML comment right in
    front of a coded value. ElementTree drops comments while parsing, so the
    labels are extracted from the raw string instead.

    The map is keyed by the bare code value; if the same value occurs more
    than once, the last label seen wins.

    Args:
        xml_str: The raw XML document as a string.

    Returns:
        Dict mapping code value to comment text.
    """
    comment_map = {}
    for match in _CODE_COMMENT_RE.finditer(xml_str):
        label, code = match.group(1), match.group(2)
        if code:
            comment_map[code] = label
    return comment_map


def _text(parent, path):
    """Return the stripped text of the element at *path* under *parent*, or None.

    None is returned when *parent* is None, the element is missing, or its
    text is empty or whitespace-only.
    """
    if parent is None:
        return None
    elem = parent.find(path, NS)
    if elem is None or not elem.text:
        return None
    return elem.text.strip() or None


def _code_label(elem, comment_map):
    """Return the comment label for the ``code`` child of *elem*, or None."""
    code = _text(elem, 'code')
    if code is None:
        return None
    return comment_map.get(code)


def _parse_person(person):
    """Parse a natuerlichePerson element into a dict (name, date_of_birth, city)."""
    data = {}
    name_parts = [part for part in (_text(person, './/tns:vorname'),
                                    _text(person, './/tns:nachname')) if part]
    if name_parts:
        data['name'] = ' '.join(name_parts)

    geburtsdatum = _text(person, './/tns:geburtsdatum')
    if geburtsdatum:
        data['date_of_birth'] = geburtsdatum

    anschrift = person.find('tns:anschrift', NS)
    if anschrift is not None:
        data['city'] = _text(anschrift, 'tns:ort')
    return data


def _parse_organisation(org, comment_map):
    """Parse an organisation element into a dict (name, legal form, seat, address)."""
    data = {'name': _text(org, './/tns:bezeichnung.aktuell')}

    rechtsform = org.find('.//tns:rechtsform', NS)
    rf_code = _text(rechtsform, 'code')
    if rf_code:
        # Fall back to the raw code when the document carries no label for it.
        data['legal_form'] = comment_map.get(rf_code, rf_code)
        data['legal_form_code'] = rf_code

    sitz = org.find('tns:sitz', NS)
    if sitz is not None:
        data['seat'] = _text(sitz, 'tns:ort')

    anschrift = org.find('tns:anschrift', NS)
    if anschrift is not None:
        addr = {}
        street = _text(anschrift, 'tns:strasse')
        hausnummer = _text(anschrift, 'tns:hausnummer')
        if street:
            addr['street'] = street + (' ' + hausnummer if hausnummer else '')
        plz = _text(anschrift, 'tns:postleitzahl')
        if plz:
            addr['postal_code'] = plz
        ort = _text(anschrift, 'tns:ort')
        if ort:
            addr['city'] = ort
        if addr:
            data['address'] = addr
    return data


def _parse_beteiligter(beteiligter, comment_map):
    """Parse a beteiligter element into a person/org dict.

    A natural person takes precedence over an organisation; an element that
    contains neither yields an empty dict.
    """
    person = beteiligter.find('.//tns:natuerlichePerson', NS)
    if person is not None:
        return _parse_person(person)
    org = beteiligter.find('.//tns:organisation', NS)
    if org is not None:
        return _parse_organisation(org, comment_map)
    return {}


def _collect_roles(root, comment_map):
    """Map each rollennummer to its role code, role label and participant data."""
    roles = {}
    for beteiligung in root.findall('.//tns:beteiligung', NS):
        beteiligter = beteiligung.find('tns:beteiligter', NS)
        if beteiligter is None:
            continue
        # The participant is the same for every role of this beteiligung, so
        # parse it once instead of once per role.
        person_data = _parse_beteiligter(beteiligter, comment_map)

        for rolle in beteiligung.findall('tns:rolle', NS):
            rollennummer = _text(rolle, 'tns:rollennummer')
            if not rollennummer:
                continue
            role_code = _text(rolle.find('tns:rollenbezeichnung', NS), 'code')
            roles[rollennummer] = {
                'role_code': role_code,
                'role_label': ROLE_CODES.get(
                    role_code, comment_map.get(role_code, role_code)),
                **person_data
            }
    return roles


def _parse_basisdaten(basisdaten, roles, comment_map, detail):
    """Fill *detail* from basisdatenRegister and annotate *roles* in place.

    Adds articles date, business purpose and general representation rules to
    *detail*; adds 'representation' / 'exempt_181_bgb' to the role dicts that
    are referenced by vertretungsberechtigte entries.
    """
    satzung = _text(basisdaten, './/tns:aktuellesSatzungsdatum')
    if satzung:
        detail['articles_of_association_date'] = satzung

    gegenstand = _text(basisdaten, './/tns:gegenstand')
    if gegenstand:
        detail['business_purpose'] = gegenstand

    allg_vertretung = basisdaten.find('.//tns:allgemeineVertretungsregelung', NS)
    if allg_vertretung is not None:
        label = _code_label(
            allg_vertretung.find('.//tns:vertretungsbefugnis', NS), comment_map)
        if label is not None:
            detail['representation_rules'] = label

    for vb_elem in basisdaten.findall('.//tns:vertretungsberechtigte', NS):
        rep = roles.get(_text(vb_elem, 'tns:ref.rollennummer'))
        if rep is None:
            continue
        besondere = vb_elem.find('tns:besondereVertretungsregelung', NS)
        if besondere is None:
            continue

        # Free text takes precedence over the coded representation rule.
        freitext = _text(besondere, './/tns:vertretungsbefugnisFreitext')
        if freitext:
            rep['representation'] = freitext.rstrip(';')
        else:
            label = _code_label(
                besondere.find('.//tns:vertretungsbefugnis', NS), comment_map)
            if label is not None:
                rep['representation'] = label

        if besondere.find('.//tns:befreiungVon181BGB', NS) is not None:
            rep['exempt_181_bgb'] = True


def _parse_register_entries(root, comment_map):
    """Return the list of eintragungstext entries as dicts."""
    entries = []
    for et_elem in root.findall('.//tns:eintragungstext', NS):
        entry = {
            'column': _text(et_elem, 'tns:spalte'),
            'position': _text(et_elem, 'tns:position'),
            'number': _text(et_elem, 'tns:laufendeNummer'),
            'text': _text(et_elem, 'tns:text'),
        }
        label = _code_label(et_elem.find('tns:eintragungsart', NS), comment_map)
        if label is not None:
            entry['type'] = label
        entries.append(entry)
    return entries


def parse_si_detail(xml_str):
    """Parse the SI (Strukturierter Registerinhalt) XJustiz XML into a dict.

    Args:
        xml_str: The XJustiz XML document as a string.

    Returns:
        Dict of the fields found in the document. Possible keys: name,
        legal_form, legal_form_code, seat, address,
        articles_of_association_date, business_purpose, representation_rules,
        directors, prokura, capital, register_type, register_number,
        retrieval_date, last_entry_date, num_entries, register_entries.
        Sections missing from the document are omitted; individual fields of
        a present section may be None.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_str* is not well-formed XML.
    """
    root = ET.fromstring(xml_str)
    comment_map = _build_comment_map(xml_str)
    roles = _collect_roles(root, comment_map)
    detail = {}

    # Company info comes from the first Rechtsträger (role code 287).
    for rdata in roles.values():
        if rdata.get('role_code') == '287':
            for key in ('name', 'legal_form', 'legal_form_code', 'seat'):
                detail[key] = rdata.get(key)
            if rdata.get('address'):
                detail['address'] = rdata['address']
            break

    basisdaten = root.find('.//tns:basisdatenRegister', NS)
    if basisdaten is not None:
        _parse_basisdaten(basisdaten, roles, comment_map, detail)

    # Geschäftsführer (086) and Vorstand (087) are both reported as directors.
    directors = [r for r in roles.values() if r.get('role_code') in ('086', '087')]
    prokura = [r for r in roles.values() if r.get('role_code') == '285']
    if directors:
        detail['directors'] = directors
    if prokura:
        detail['prokura'] = prokura

    # Capital (Stammkapital / Grundkapital). Compare against None explicitly:
    # an Element without children is falsy, so ``a or b`` would be wrong here.
    kapital = root.find('.//tns:stammkapital', NS)
    if kapital is None:
        kapital = root.find('.//tns:grundkapital', NS)
    if kapital is not None:
        zahl = _text(kapital, 'tns:zahl')
        if zahl:
            waehrung_code = _text(kapital.find('.//tns:waehrung', NS), 'code')
            detail['capital'] = {
                'amount': zahl,
                'currency': waehrung_code or 'EUR'
            }

    # Register info
    aktenzeichen = root.find('.//tns:aktenzeichen.strukturiert', NS)
    if aktenzeichen is not None:
        register = aktenzeichen.find('tns:register', NS)
        nummer = _text(aktenzeichen, 'tns:laufendeNummer')
        if register is not None and nummer:
            detail['register_type'] = _text(register, 'code')
            detail['register_number'] = nummer

    # Auszug metadata
    auszug = root.find('.//tns:auszug', NS)
    if auszug is not None:
        detail['retrieval_date'] = _text(auszug, 'tns:abrufdatum')
        detail['last_entry_date'] = _text(auszug, 'tns:letzteEintragung')
        detail['num_entries'] = _text(auszug, 'tns:anzahlEintragungen')

    entries = _parse_register_entries(root, comment_map)
    if entries:
        detail['register_entries'] = entries

    return detail
def pr_company_detail(detail):
    """Print the SI detail fields in human-readable format.

    Fields whose value is None are treated like missing fields, so the
    literal text "None" never appears in the output.

    Args:
        detail: Dict of company detail fields, or a falsy value when no
            detail data is available.
    """
    if not detail:
        print(' (no detail data available)')
        return

    print()
    print('--- Detail (SI) ---')
    for key in ('name', 'legal_form', 'seat', 'business_purpose',
                'articles_of_association_date', 'representation_rules'):
        value = detail.get(key)
        # A key may be present with a None value; skip it rather than print "None".
        if value is not None:
            print('%s: %s' % (key.replace('_', ' ').title(), value))

    addr = detail.get('address')
    if addr:
        # Postal code and city form one part; either may appear on its own.
        locality = ' '.join(
            part for part in (addr.get('postal_code'), addr.get('city')) if part)
        parts = [part for part in (addr.get('street'), locality) if part]
        if parts:
            print('Address: %s' % ', '.join(parts))

    if 'capital' in detail:
        cap = detail['capital']
        print('Capital: %s %s' % (cap['amount'], cap['currency']))

    # Directors and prokura holders share one layout: name plus optional
    # representation rule in parentheses.
    for key, heading in (('directors', 'Directors:'), ('prokura', 'Prokura:')):
        if key not in detail:
            continue
        print(heading)
        for person in detail[key]:
            extra = ''
            if person.get('representation'):
                extra = ' (%s)' % person['representation']
            print(' %s%s' % (person.get('name') or '?', extra))

    if 'register_entries' in detail:
        print('Register Entries:')
        for entry in detail['register_entries']:
            print(' [%s] %s' % (entry.get('type') or '', entry.get('text') or ''))

    print('Last Entry: %s' % (detail.get('last_entry_date') or '-'))
    print('Retrieval Date: %s' % (detail.get('retrieval_date') or '-'))
+ results = [] for result in grid.find_all('tr'): a = result.get('data-ri') @@ -195,6 +555,12 @@ def parse_args(): help="Return response as JSON", action="store_true" ) + parser.add_argument( + "-det", + "--detail", + help="Fetch SI (structured register content) detail for the first search result", + action="store_true" + ) args = parser.parse_args() @@ -214,8 +580,17 @@ def parse_args(): h.open_startpage() companies = h.search_company() if companies is not None: + detail = None + if args.detail and len(companies) > 0: + detail = h.fetch_company_detail(result_index=0) + if args.json: - print(json.dumps(companies)) + output = companies + if detail: + output[0]['detail'] = detail + print(json.dumps(output)) else: for c in companies: pr_company_info(c) + if detail: + pr_company_detail(detail) diff --git a/test_handelsregister.py b/test_handelsregister.py index fa1951a..9b6a717 100644 --- a/test_handelsregister.py +++ b/test_handelsregister.py @@ -1,12 +1,12 @@ import pytest -from handelsregister import get_companies_in_searchresults,HandelsRegister +from handelsregister import get_companies_in_searchresults,HandelsRegister,parse_si_detail import argparse def test_parse_search_result(): html = '%s' % """
Berlin District court Berlin (Charlottenburg) HRB 44343
GASAG AGBerlincurrently registered
History
1.) Gasag Berliner Gaswerke Aktiengesellschaft1.) Berlin
""" res = get_companies_in_searchresults(html) assert res == [{ - 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343', + 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343', 'register_num': 'HRB 44343 B', 'name':'GASAG AG', 'state':'Berlin', @@ -17,6 +17,75 @@ def test_parse_search_result(): },] +def test_parse_si_detail(): + """Test parsing of SI (Strukturierter Registerinhalt) XJustiz XML.""" + si_xml = """unbekannt2026-02-25T11:21:36ZU1308test-id002RegisSTARTest1.00U1308HRB32007Strukturierter Registerinhalt114536528302871Test Company GmbH221110Leipzig003Teststraße4204109Leipzig000414533251500864MaxMustermann1984-06-25017Leipzig514543556802855ErikaMusterfrau1990-01-15017Berlin0021612009Gesellschaftsvertrag vom 18.06.2015.11:21:362026-02-252025-06-0272025-05-222015-06-18Test Company GmbH221110Leipzig003Teststraße4204109Leipzig0000664einzelvertretungsberechtigt;0115002Softwareentwicklung und IT-Beratung.25000.00EUR""" + + detail = parse_si_detail(si_xml) + + # Company info + assert detail['name'] == 'Test Company GmbH' + assert 'GmbH' in detail['legal_form'] + assert detail['seat'] == 'Leipzig' + + # Address + assert detail['address']['street'] == 'Teststraße 42' + assert detail['address']['postal_code'] == '04109' + assert detail['address']['city'] == 'Leipzig' + + # Business purpose + assert 'Softwareentwicklung' in detail['business_purpose'] + + # Capital + assert detail['capital']['amount'] == '25000.00' + assert detail['capital']['currency'] == 'EUR' + + # Directors + assert len(detail['directors']) == 1 + assert detail['directors'][0]['name'] == 'Max Mustermann' + assert detail['directors'][0]['date_of_birth'] == '1984-06-25' + + # Prokura + assert len(detail['prokura']) == 1 + assert detail['prokura'][0]['name'] == 'Erika Musterfrau' + + # Register info + assert detail['register_type'] == 'HRB' + assert detail['register_number'] == '32007' + assert detail['retrieval_date'] == '2026-02-25' + assert detail['last_entry_date'] == 
def test_fetch_detail():
    """Live round trip: search a known company, then fetch and parse its SI document.

    Talks to the real register portal, so it needs network access.
    """
    cli_args = argparse.Namespace(
        debug=False,
        force=True,
        schlagwoerter='Gecko Two GmbH',
        schlagwortOptionen='exact',
        json=False,
        detail=True,
    )
    register = HandelsRegister(cli_args)
    register.open_startpage()

    hits = register.search_company()
    assert hits is not None
    assert len(hits) > 0

    detail = register.fetch_company_detail(result_index=0)
    assert detail is not None

    # Fields expected to be present for this company.
    assert detail.get('name') is not None
    assert 'Gecko' in detail['name']
    for field in ('seat', 'address', 'capital', 'directors'):
        assert detail.get(field) is not None
    assert len(detail['directors']) > 0
json=False, detail=False) h = HandelsRegister(args) h.open_startpage() companies = h.search_company() assert companies is not None - + target_company = next((c for c in companies if '138434' in c['register_num']), None) - + assert target_company is not None, "Haus-Anker Verwaltungs GmbH with expected number not found" - assert target_company['register_num'] == 'HRB 138434 B' \ No newline at end of file + assert target_company['register_num'] == 'HRB 138434 B'