def _utf16_to_python_offsets(text, utf16_offset, utf16_length):
    """Convert a UTF-16 offset/length pair to a Python string offset/length.

    Telegram entities use UTF-16 code units for offsets, but Python strings
    are indexed by Unicode code points. Characters outside the BMP (most
    emoji) take 2 UTF-16 code units but only 1 Python character, so the two
    numbering schemes diverge after the first such character.

    Args:
        text: The message text the offsets refer to.
        utf16_offset: Start of the span, in UTF-16 code units.
        utf16_length: Length of the span, in UTF-16 code units.

    Returns:
        A ``(py_offset, py_length)`` tuple in Python code-point units. Both
        values are bounded by the text: a span running past the end of
        *text* is truncated to the characters that exist.
    """
    # Phase 1: walk characters until the requested number of UTF-16 units
    # has been consumed; the number of characters walked is the offset.
    py_offset = 0
    utf16_pos = 0
    for ch in text:
        if utf16_pos >= utf16_offset:
            break
        utf16_pos += 2 if ord(ch) > 0xFFFF else 1
        py_offset += 1

    # Phase 2: same walk, starting at the span, to measure its length.
    py_length = 0
    utf16_consumed = 0
    for ch in text[py_offset:]:
        if utf16_consumed >= utf16_length:
            break
        utf16_consumed += 2 if ord(ch) > 0xFFFF else 1
        py_length += 1

    return py_offset, py_length


def _entity_markup(ent):
    """Return the ``(prefix, suffix)`` markdown pair for *ent*, or ``None``.

    ``None`` means the entity needs no markup: MessageEntityUrl,
    MessageEntityMention, MessageEntityHashtag and similar are already
    visible in the text and are kept as-is.
    """
    if isinstance(ent, MessageEntityBold):
        return "**", "**"
    if isinstance(ent, MessageEntityItalic):
        return "_", "_"
    if isinstance(ent, MessageEntityStrike):
        return "~~", "~~"
    if isinstance(ent, MessageEntityCode):
        return "`", "`"
    if isinstance(ent, MessageEntityPre):
        lang = getattr(ent, "language", "") or ""
        return f"```{lang}\n", "\n```"
    if isinstance(ent, MessageEntityTextUrl):
        url = ent.url or ""
        return "[", f"]({url})"
    if isinstance(ent, MessageEntityMentionName):
        return "[", f"](tg://user?id={ent.user_id})"
    return None


def _apply_markup(text, spans):
    """Wrap slices of *text* in markup without corrupting nested spans.

    Args:
        text: The original, unmodified text.
        spans: Iterable of ``(offset, length, prefix, suffix)`` tuples, with
            offset/length in Python code-point units relative to *text*.

    Returns:
        *text* with every prefix inserted at ``offset`` and every suffix at
        ``offset + length``. Spans of zero or negative length are ignored.

    All marker positions refer to the ORIGINAL text and the result is
    assembled in a single pass. Applying spans one after another to an
    already-modified string would shift the end of any span that contains
    (or starts together with) another one.
    """
    opens = {}   # position -> [(-length, order, prefix)]
    closes = {}  # position -> [(-start, -order, suffix)]
    for order, (offset, length, prefix, suffix) in enumerate(spans):
        if length <= 0:
            continue  # an empty span would only add noise such as "****"
        # Sort keys: at one position the longest span opens first (it is the
        # outermost); on close, the span that started last closes first.
        # ``order`` is unique, so the marker strings are never compared.
        opens.setdefault(offset, []).append((-length, order, prefix))
        closes.setdefault(offset + length, []).append((-offset, -order, suffix))

    pieces = []
    cursor = 0
    for position in sorted(opens.keys() | closes.keys()):
        pieces.append(text[cursor:position])
        # Close spans ending here before opening the ones that start here,
        # so adjacent spans do not interleave their markers.
        pieces.extend(suffix for _, _, suffix in sorted(closes.get(position, ())))
        pieces.extend(prefix for _, _, prefix in sorted(opens.get(position, ())))
        cursor = position
    pieces.append(text[cursor:])
    return "".join(pieces)


def message_to_markdown(msg) -> str:
    """Convert a Telethon message to markdown, preserving its entities.

    Bold, italic, strikethrough, inline code, code blocks, text links and
    name mentions are rendered as markdown. Other entity types are left
    untouched because they are already visible in the text.

    Args:
        msg: Object exposing ``message`` (the text, possibly ``None``) and
            ``entities`` (a sequence of entities, possibly ``None``); each
            entity exposes ``offset`` and ``length`` in UTF-16 code units.

    Returns:
        The markdown text; ``""`` when the message has no text, and the raw
        text when it has no entities.
    """
    text = msg.message
    if not text:
        return ""
    entities = msg.entities
    if not entities:
        return text

    spans = []
    for ent in entities:
        markup = _entity_markup(ent)
        if markup is None:
            continue
        offset, length = _utf16_to_python_offsets(text, ent.offset, ent.length)
        spans.append((offset, length, *markup))

    return _apply_markup(text, spans)
str]) -> str: sender_name += f" {last_msg.sender.last_name}" sender_name = sender_name.strip() or "Unknown" result.append(f"Last Message: From {sender_name} at {last_msg.date}") - result.append(f"Message: {last_msg.message or '[Media/No text]'}") + result.append(f"Message: {message_to_markdown(last_msg) or '[Media/No text]'}") except Exception as diag_ex: logger.warning(f"Could not get dialog info for {chat_id}: {diag_ex}") pass @@ -1225,7 +1302,7 @@ async def get_last_interaction(contact_id: Union[int, str]) -> str: for msg in messages: sender = "You" if msg.out else contact_name - message_text = msg.message or "[Media/No text]" + message_text = message_to_markdown(msg) or "[Media/No text]" results.append(f"Date: {msg.date}, From: {sender}, Message: {message_text}") return "\n".join(results) @@ -1282,14 +1359,14 @@ async def get_message_context( replied_sender = getattr( replied_msg.sender, "first_name", "" ) or getattr(replied_msg.sender, "title", "Unknown") - reply_content = f" | reply to {msg.reply_to.reply_to_msg_id}\n → Replied message: [{replied_sender}] {replied_msg.message or '[Media/No text]'}" + reply_content = f" | reply to {msg.reply_to.reply_to_msg_id}\n → Replied message: [{replied_sender}] {message_to_markdown(replied_msg) or '[Media/No text]'}" except Exception: reply_content = ( f" | reply to {msg.reply_to.reply_to_msg_id} (original message not found)" ) results.append( - f"ID: {msg.id} | {sender_name} | {msg.date}{highlight}{reply_content}\n{msg.message or '[Media/No text]'}\n" + f"ID: {msg.id} | {sender_name} | {msg.date}{highlight}{reply_content}\n{message_to_markdown(msg) or '[Media/No text]'}\n" ) return "\n".join(results) except Exception as e: @@ -2830,7 +2907,7 @@ async def search_messages(chat_id: Union[int, str], query: str, limit: int = 20) if msg.reply_to and msg.reply_to.reply_to_msg_id: reply_info = f" | reply to {msg.reply_to.reply_to_msg_id}" lines.append( - f"ID: {msg.id} | {sender_name} | Date: {msg.date}{reply_info} | Message: 
{msg.message}" + f"ID: {msg.id} | {sender_name} | Date: {msg.date}{reply_info} | Message: {message_to_markdown(msg)}" ) return "\n".join(lines) except Exception as e: @@ -3208,7 +3285,7 @@ async def get_history(chat_id: Union[int, str], limit: int = 100) -> str: if msg.reply_to and msg.reply_to.reply_to_msg_id: reply_info = f" | reply to {msg.reply_to.reply_to_msg_id}" lines.append( - f"ID: {msg.id} | {sender_name} | Date: {msg.date}{reply_info} | Message: {msg.message}" + f"ID: {msg.id} | {sender_name} | Date: {msg.date}{reply_info} | Message: {message_to_markdown(msg)}" ) return "\n".join(lines) except Exception as e: @@ -3311,7 +3388,7 @@ async def get_pinned_messages(chat_id: Union[int, str]) -> str: if msg.reply_to and msg.reply_to.reply_to_msg_id: reply_info = f" | reply to {msg.reply_to.reply_to_msg_id}" lines.append( - f"ID: {msg.id} | {sender_name} | Date: {msg.date}{reply_info} | Message: {msg.message or '[Media/No text]'}" + f"ID: {msg.id} | {sender_name} | Date: {msg.date}{reply_info} | Message: {message_to_markdown(msg) or '[Media/No text]'}" ) return "\n".join(lines)